Compare commits
240 Commits
1af02f4495
..
v0.9.0
| Author | SHA1 | Date | |
|---|---|---|---|
| ab7fee0ae7 | |||
| ed276813f0 | |||
| 02e4ef7544 | |||
| ddb46e16b6 | |||
| e8913943f9 | |||
| fb978ad10c | |||
| 9abdedf40a | |||
| 2e1961beee | |||
| e0989e1cef | |||
| fce7245a51 | |||
| 5154b24fab | |||
| 1cf9cb752f | |||
| d2ffc98f3c | |||
| 1fd9dce8a2 | |||
| 746324e65a | |||
| ede014e85b | |||
| 4594e563ef | |||
| db2fcdd52e | |||
| e2976a42e6 | |||
| 14be63510c | |||
| 70aa22e87e | |||
| 154b57a4cd | |||
| c5b29b88b9 | |||
| 1df072a211 | |||
| 2421d5d389 | |||
| 168059ae45 | |||
| c1426110e5 | |||
| 3922cbece2 | |||
| 6295faad64 | |||
| 2d9e53b025 | |||
| 0521a2169f | |||
| c98eb19adb | |||
| 2dd8f3c3be | |||
| 2f3292aebf | |||
| 04a413eb55 | |||
| 211f11e460 | |||
| 426b06d43d | |||
| 18affc1f16 | |||
| 53016aee93 | |||
| 9e044fd7b0 | |||
| 7c241c55d1 | |||
| e5f79902fd | |||
| 81f2852eb1 | |||
| 0407aa420b | |||
| 56108ffc33 | |||
| c75777b60f | |||
| 085fa9684b | |||
| 529104b8e4 | |||
| 2ba561410f | |||
| 8727d6bacc | |||
| e76a383813 | |||
| 93d857d995 | |||
| dafdfcda3f | |||
| c6fbe7c0e0 | |||
| a1d307fafa | |||
| 9712c65b04 | |||
| 45d4844937 | |||
| b2983aed52 | |||
| c1fd5a3526 | |||
| 4f66cc2b34 | |||
| deb8b874ca | |||
| 86fe569ea0 | |||
| 16c77a8cc5 | |||
| 4f94ddbbcb | |||
| d46adabeec | |||
| 595656ed59 | |||
| 85c62741b5 | |||
| bc5ce12957 | |||
| 38090dd457 | |||
| 350be3f19d | |||
| 9d7a714102 | |||
| a2355b2412 | |||
| bd310d918b | |||
| 4e95ad4c87 | |||
| 38683b4e64 | |||
| d830635a2e | |||
| 373d74cdaf | |||
| 6139795f71 | |||
| cbdaa4daeb | |||
| 24eecc1673 | |||
| 04dde93acd | |||
| 809c4ed910 | |||
| b25f96e465 | |||
| e0847517a8 | |||
| 9dbed025e0 | |||
| 35dee98cf9 | |||
| 5d8350132c | |||
| 5c6ac155eb | |||
| c710743231 | |||
| 5e655d756d | |||
| 9abe817aae | |||
| 4d469da0f8 | |||
| 4ef7486eef | |||
| e6c5dea6bf | |||
| 1ff0b2dc86 | |||
| 5031c888ed | |||
| d0baabc745 | |||
| 8a92fedba1 | |||
| 7c62d111d5 | |||
| b2dffb1d83 | |||
| db71e006bb | |||
| 2692c660c5 | |||
| fba76579bc | |||
| 24529d8fa7 | |||
| aa2d7db097 | |||
| 539b941db5 | |||
| bec7f6d2b9 | |||
| a781e95c94 | |||
| 727c610765 | |||
| 65a0134101 | |||
| c417b5e9ab | |||
| 4c108bb68a | |||
| f5e3bca6a2 | |||
| 13f58bd052 | |||
| 94149a7324 | |||
| d325a27439 | |||
| 8a42f20f8c | |||
| 1bf8c09808 | |||
| 2095505edd | |||
| ed6ccef31b | |||
| 8062db1f2f | |||
| 4c81ff3e7b | |||
| a46d906d27 | |||
| fd87218b3f | |||
| cd80be3b13 | |||
| a5a2cb91d0 | |||
| 7b1990cf11 | |||
| 18b0bf976d | |||
| c9b49637d1 | |||
| d02a093eeb | |||
| e6fc9e9963 | |||
| cdf88c6dc3 | |||
| 95ab3f4d16 | |||
| d6f3d84ba5 | |||
| 609aaf741d | |||
| e850f6f44c | |||
| 51a7ea302f | |||
| f94e8ec967 | |||
| adece5eb72 | |||
| 9ec69456fe | |||
| 5bcb20dfce | |||
| f103b14c62 | |||
| 6af5a945ce | |||
| e0eae0a96f | |||
| d6dcdd5ec4 | |||
| 5b4a590508 | |||
| 18a4f74a22 | |||
| aba0b7e177 | |||
| 14b703be58 | |||
| ae96983877 | |||
| 6f204a6877 | |||
| c5b52df7ed | |||
| e2d94bf3a2 | |||
| c5f401e99b | |||
| 69abc40786 | |||
| 35f07c3cee | |||
| a110e3c00c | |||
| 22adde36b3 | |||
| 57bf9690f2 | |||
| c1237583bd | |||
| 0c3c907de8 | |||
| e93eb2a060 | |||
| 485f4322cb | |||
| b24faf6de7 | |||
| 9b790bbade | |||
| 11cbc2fb7f | |||
| 5200e44536 | |||
| 84a8c060b6 | |||
| cfe25b9799 | |||
| f801fdf65b | |||
| 9f2cb18e42 | |||
| e73c4bd96c | |||
| bd460d7532 | |||
| 2ba2c9c7db | |||
| 380931b3a8 | |||
| d9c8da139c | |||
| 174bdae750 | |||
| b6f8de1dcc | |||
| 41c3ec7c6f | |||
| 8b57b8a06d | |||
| a4823193e7 | |||
| 5f2845c331 | |||
| e45f75598f | |||
| 9ac5088fde | |||
| 0b70da2955 | |||
| 54528b9b15 | |||
| 8d993ac77c | |||
| 27a995e812 | |||
| da9ed4c3d4 | |||
| 079b4bed70 | |||
| 84914fd6c5 | |||
| c019633b77 | |||
| d692272d10 | |||
| ec0bf0f6c3 | |||
| 0735038ea8 | |||
| e6657c23ff | |||
| e7eea7afac | |||
| e717b6998c | |||
| 49ecb7c771 | |||
| 6a171596f1 | |||
| 457a7e049c | |||
| 413d0bdb1b | |||
| 047c1d1912 | |||
| c46024c03a | |||
| f692ad592c | |||
| c5777122db | |||
| c1f85da55f | |||
| 8fb1c100fd | |||
| c6237d4004 | |||
| 608962441b | |||
| a086b0eb75 | |||
| aa9fc330fc | |||
| c8ead66f08 | |||
| 8aa635f0c1 | |||
| e6729a5a3d | |||
| cc9dcff816 | |||
| 9795492f2e | |||
| 86f7c17d9d | |||
| 55242caf58 | |||
| 8b7b1479a1 | |||
| cca525a04d | |||
| afce98f105 | |||
| 9798a2b5fe | |||
| 44feb708bc | |||
| 6cfbdfc7ab | |||
| 27086783da | |||
| b3b89045f2 | |||
| 8d8150ee6e | |||
| 51bbb555d4 | |||
| 8d5282a180 | |||
| 811157b4ce | |||
| 80a57b3b84 | |||
| a7c6a6e09c | |||
| 24ab071702 | |||
| 9cc0caff1e | |||
| df2c584b23 | |||
| f55747a281 | |||
| c821ec1fe0 | |||
| 25aa001135 | |||
| ab02869d82 |
@@ -1,32 +0,0 @@
|
|||||||
<!--
|
|
||||||
Thanks for the PR! A few quick checks before submitting:
|
|
||||||
|
|
||||||
* Did you open an issue first for non-trivial changes?
|
|
||||||
* `make lint test` is green locally?
|
|
||||||
* Commits are focused (one logical change per commit)?
|
|
||||||
* No `Co-Authored-By` trailers (repo policy)?
|
|
||||||
* No new dependencies without a one-line justification below?
|
|
||||||
-->
|
|
||||||
|
|
||||||
## Summary
|
|
||||||
|
|
||||||
<!-- One paragraph: what changed and why. -->
|
|
||||||
|
|
||||||
## Test plan
|
|
||||||
|
|
||||||
<!-- Bullet list of what you actually ran. Be specific.
|
|
||||||
- `make test` → green
|
|
||||||
- Manually exercised the new flow at /hosts/{id}/foo
|
|
||||||
- Smoke env: enrolled a fresh host, ran a backup end-to-end
|
|
||||||
-->
|
|
||||||
|
|
||||||
## Notes for the reviewer
|
|
||||||
|
|
||||||
<!-- Anything the reviewer needs to know that isn't obvious from the
|
|
||||||
diff: related issue, follow-up work that's intentionally not
|
|
||||||
in this PR, deferred concerns, design alternatives considered
|
|
||||||
and rejected. -->
|
|
||||||
|
|
||||||
## Linked issues
|
|
||||||
|
|
||||||
<!-- "Closes #123" / "Refs #456" / "Part of P5-06" -->
|
|
||||||
@@ -1,52 +0,0 @@
|
|||||||
---
|
|
||||||
name: Bug report
|
|
||||||
about: Something isn't behaving the way the docs / code suggest it should
|
|
||||||
title: "[bug] "
|
|
||||||
labels: bug
|
|
||||||
---
|
|
||||||
|
|
||||||
## What happened
|
|
||||||
|
|
||||||
<!-- A clear description of the actual behaviour. Include the exact
|
|
||||||
UI surface, API endpoint, or CLI invocation involved. -->
|
|
||||||
|
|
||||||
## What you expected
|
|
||||||
|
|
||||||
<!-- What you thought would happen, and where that expectation came from
|
|
||||||
(docs page, command output, prior behaviour). -->
|
|
||||||
|
|
||||||
## Steps to reproduce
|
|
||||||
|
|
||||||
1.
|
|
||||||
2.
|
|
||||||
3.
|
|
||||||
|
|
||||||
## Environment
|
|
||||||
|
|
||||||
- restic-manager server version: <!-- `restic-manager-server --version` or footer of the UI -->
|
|
||||||
- Agent version (if relevant): <!-- `restic-manager-agent --version` -->
|
|
||||||
- restic version on affected host: <!-- `restic version` -->
|
|
||||||
- Host OS: <!-- e.g. "Ubuntu 22.04 amd64" or "Windows Server 2022" -->
|
|
||||||
- How was the server installed: <!-- docker compose / source build / other -->
|
|
||||||
|
|
||||||
## Logs / output
|
|
||||||
|
|
||||||
<details><summary>Server log (sanitised)</summary>
|
|
||||||
|
|
||||||
```
|
|
||||||
<!-- paste relevant lines; redact tokens, passwords, repo URLs -->
|
|
||||||
```
|
|
||||||
|
|
||||||
</details>
|
|
||||||
|
|
||||||
<details><summary>Agent log (sanitised)</summary>
|
|
||||||
|
|
||||||
```
|
|
||||||
```
|
|
||||||
|
|
||||||
</details>
|
|
||||||
|
|
||||||
## Anything else
|
|
||||||
|
|
||||||
<!-- Screenshots, related issues, recent changes you made before the
|
|
||||||
bug appeared, anything that might help. -->
|
|
||||||
@@ -1,34 +0,0 @@
|
|||||||
---
|
|
||||||
name: Feature request
|
|
||||||
about: Suggest a new capability or change to existing behaviour
|
|
||||||
title: "[feature] "
|
|
||||||
labels: enhancement
|
|
||||||
---
|
|
||||||
|
|
||||||
## What you're trying to do
|
|
||||||
|
|
||||||
<!-- Describe the use case, not the proposed solution. Who is the
|
|
||||||
operator, what are they trying to accomplish, and what's
|
|
||||||
blocking them today? -->
|
|
||||||
|
|
||||||
## Why the current behaviour falls short
|
|
||||||
|
|
||||||
<!-- What does the system do today, and where does it stop short of
|
|
||||||
the use case above? -->
|
|
||||||
|
|
||||||
## Proposed direction (optional)
|
|
||||||
|
|
||||||
<!-- If you have a specific design in mind, describe it. Skip this
|
|
||||||
section if you'd rather leave it to the maintainer. -->
|
|
||||||
|
|
||||||
## Scope check
|
|
||||||
|
|
||||||
- [ ] I've read [`spec.md`](../spec.md) §2 (Goals & Non-Goals).
|
|
||||||
- [ ] This isn't already on the roadmap in [`tasks.md`](../tasks.md).
|
|
||||||
- [ ] This fits the project's "small fleet, one person operating"
|
|
||||||
target rather than enterprise / multi-tenant / SaaS use cases.
|
|
||||||
|
|
||||||
## Anything else
|
|
||||||
|
|
||||||
<!-- Related restic features, prior art in similar tools, links to
|
|
||||||
discussions you've had elsewhere. -->
|
|
||||||
@@ -1,98 +0,0 @@
|
|||||||
# P5-06 — End-to-end test suite.
|
|
||||||
#
|
|
||||||
# Spec : docs/superpowers/specs/2026-05-07-p5-oss-readiness-design.md
|
|
||||||
# Stack: e2e/compose.e2e.yml (server + agent + rest-server + playwright)
|
|
||||||
# Tests: e2e/playwright/tests/*.spec.ts
|
|
||||||
#
|
|
||||||
# Triggered on every PR into main and on workflow_dispatch. Runs
|
|
||||||
# longer than the unit-test workflow (~3-4 minutes for a clean run);
|
|
||||||
# kept separate so a slow e2e doesn't block the fast lint/test loop.
|
|
||||||
#
|
|
||||||
# Networking note: every interaction with the server (health probe,
|
|
||||||
# Playwright) happens from a container on the compose `rmnet`
|
|
||||||
# network, addressing the server as `http://server:8080`. We can't
|
|
||||||
# rely on `127.0.0.1:8080` because Gitea's runner executes steps
|
|
||||||
# inside its own container, where compose's host port-publish is
|
|
||||||
# not visible.
|
|
||||||
|
|
||||||
name: e2e
|
|
||||||
|
|
||||||
on:
|
|
||||||
pull_request:
|
|
||||||
branches: [main]
|
|
||||||
workflow_dispatch:
|
|
||||||
|
|
||||||
jobs:
|
|
||||||
e2e:
|
|
||||||
name: Playwright vs docker-compose
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
timeout-minutes: 15
|
|
||||||
steps:
|
|
||||||
- uses: actions/checkout@v4
|
|
||||||
|
|
||||||
- name: Build the e2e stack
|
|
||||||
run: docker compose -f e2e/compose.e2e.yml build
|
|
||||||
|
|
||||||
- name: Bring up the stack
|
|
||||||
run: docker compose -f e2e/compose.e2e.yml up -d server rest-server source-fixture
|
|
||||||
|
|
||||||
- name: Wait for server health
|
|
||||||
run: |
|
|
||||||
set -eu
|
|
||||||
for i in $(seq 1 30); do
|
|
||||||
if docker run --rm --network e2e_rmnet curlimages/curl:8.10.1 \
|
|
||||||
-fsS http://server:8080/api/version >/dev/null 2>&1; then
|
|
||||||
echo "server up"; exit 0
|
|
||||||
fi
|
|
||||||
sleep 2
|
|
||||||
done
|
|
||||||
echo "server didn't come up"; docker compose -f e2e/compose.e2e.yml logs server; exit 1
|
|
||||||
|
|
||||||
- name: Capture bootstrap token from server logs
|
|
||||||
id: bootstrap
|
|
||||||
run: |
|
|
||||||
set -eu
|
|
||||||
for i in $(seq 1 15); do
|
|
||||||
line=$(docker compose -f e2e/compose.e2e.yml logs server 2>&1 | grep -E 'bootstrap token' -A2 | grep -Eo '[a-zA-Z0-9_-]{40,}' | head -1 || true)
|
|
||||||
if [ -n "$line" ]; then
|
|
||||||
echo "RM_BOOTSTRAP_TOKEN=$line" >> "$GITHUB_ENV"
|
|
||||||
echo "got bootstrap token (${#line} chars)"
|
|
||||||
exit 0
|
|
||||||
fi
|
|
||||||
sleep 1
|
|
||||||
done
|
|
||||||
echo "bootstrap token not found in logs"
|
|
||||||
docker compose -f e2e/compose.e2e.yml logs server
|
|
||||||
exit 1
|
|
||||||
|
|
||||||
- name: Start the agent
|
|
||||||
run: docker compose -f e2e/compose.e2e.yml up -d agent
|
|
||||||
|
|
||||||
- name: Prepare report mounts
|
|
||||||
run: |
|
|
||||||
mkdir -p e2e/playwright/playwright-report e2e/playwright/test-results
|
|
||||||
chmod -R a+rwX e2e/playwright/playwright-report e2e/playwright/test-results
|
|
||||||
|
|
||||||
- name: Run Playwright tests
|
|
||||||
env:
|
|
||||||
RM_BOOTSTRAP_TOKEN: ${{ env.RM_BOOTSTRAP_TOKEN }}
|
|
||||||
run: docker compose -f e2e/compose.e2e.yml run --rm playwright
|
|
||||||
|
|
||||||
- name: Compose logs (on failure)
|
|
||||||
if: failure()
|
|
||||||
run: |
|
|
||||||
docker compose -f e2e/compose.e2e.yml logs --tail=200 server
|
|
||||||
docker compose -f e2e/compose.e2e.yml logs --tail=200 agent
|
|
||||||
docker compose -f e2e/compose.e2e.yml logs --tail=200 rest-server
|
|
||||||
|
|
||||||
- name: Upload Playwright report (on failure)
|
|
||||||
if: failure()
|
|
||||||
uses: actions/upload-artifact@v3
|
|
||||||
with:
|
|
||||||
name: playwright-report
|
|
||||||
path: e2e/playwright/playwright-report
|
|
||||||
retention-days: 7
|
|
||||||
|
|
||||||
- name: Tear down
|
|
||||||
if: always()
|
|
||||||
run: docker compose -f e2e/compose.e2e.yml down -v
|
|
||||||
@@ -2,10 +2,6 @@
|
|||||||
/bin/
|
/bin/
|
||||||
/dist/
|
/dist/
|
||||||
|
|
||||||
# Generated mdBook output (source under docs/book/src is committed,
|
|
||||||
# the rendered book/ directory is not).
|
|
||||||
/docs/book/book/
|
|
||||||
|
|
||||||
# Local data / runtime state
|
# Local data / runtime state
|
||||||
/data/
|
/data/
|
||||||
/certs/
|
/certs/
|
||||||
|
|||||||
@@ -38,7 +38,7 @@ but the **agent** is fetched by the install script from the server's
|
|||||||
**install script** are fetched from `<DataDir>/install/`. Plain
|
**install script** are fetched from `<DataDir>/install/`. Plain
|
||||||
`make build` doesn't touch any of those — the source-of-truth files
|
`make build` doesn't touch any of those — the source-of-truth files
|
||||||
in the working tree (`deploy/install/*`, `bin/restic-manager-agent`)
|
in the working tree (`deploy/install/*`, `bin/restic-manager-agent`)
|
||||||
must be copied into `$HOME/smoke/data/...` *and* the running agent
|
must be copied into `/tmp/rm-smoke/data/...` *and* the running agent
|
||||||
on this dev host needs replacing if the change touches agent code or
|
on this dev host needs replacing if the change touches agent code or
|
||||||
the unit file.
|
the unit file.
|
||||||
|
|
||||||
@@ -53,13 +53,13 @@ asking the operator to test.**
|
|||||||
```sh
|
```sh
|
||||||
# 1. Restage what the install script serves (binary + unit + script).
|
# 1. Restage what the install script serves (binary + unit + script).
|
||||||
cp bin/restic-manager-agent \
|
cp bin/restic-manager-agent \
|
||||||
$HOME/smoke/data/agent-binaries/restic-manager-agent-linux-amd64
|
/tmp/rm-smoke/data/agent-binaries/restic-manager-agent-linux-amd64
|
||||||
cp deploy/install/install.sh \
|
cp deploy/install/install.sh \
|
||||||
$HOME/smoke/data/install/install.sh
|
/tmp/rm-smoke/data/install/install.sh
|
||||||
cp deploy/install/install.ps1 \
|
cp deploy/install/install.ps1 \
|
||||||
$HOME/smoke/data/install/install.ps1
|
/tmp/rm-smoke/data/install/install.ps1
|
||||||
cp deploy/install/restic-manager-agent.service \
|
cp deploy/install/restic-manager-agent.service \
|
||||||
$HOME/smoke/data/install/restic-manager-agent.service
|
/tmp/rm-smoke/data/install/restic-manager-agent.service
|
||||||
|
|
||||||
# 2. Replace the running agent on this dev box and restart the
|
# 2. Replace the running agent on this dev box and restart the
|
||||||
# service. Skip only when the change is server-side only AND
|
# service. Skip only when the change is server-side only AND
|
||||||
@@ -74,36 +74,15 @@ sudo -n systemctl restart restic-manager-agent
|
|||||||
# 3. The server runs from the working tree; restart it manually
|
# 3. The server runs from the working tree; restart it manually
|
||||||
# after a build that touches server code:
|
# after a build that touches server code:
|
||||||
pkill -f restic-manager-server
|
pkill -f restic-manager-server
|
||||||
RM_LISTEN=:8080 RM_DATA_DIR=$HOME/smoke/data \
|
RM_LISTEN=:8080 RM_DATA_DIR=/tmp/rm-smoke/data \
|
||||||
RM_BASE_URL=http://127.0.0.1:8080 \
|
RM_BASE_URL=http://127.0.0.1:8080 \
|
||||||
RM_SECRET_KEY_FILE=$HOME/smoke/data/secret.key \
|
RM_SECRET_KEY_FILE=/tmp/rm-smoke/data/secret.key \
|
||||||
RM_COOKIE_SECURE=false \
|
RM_COOKIE_SECURE=false \
|
||||||
./bin/restic-manager-server >> $HOME/smoke/server.log 2>&1 &
|
./bin/restic-manager-server >> /tmp/rm-smoke/server.log 2>&1 &
|
||||||
```
|
```
|
||||||
|
|
||||||
## Smoke server: use the Make targets, not raw `nohup`
|
A `make smoke-deploy` target that bundles all of this would be a
|
||||||
|
good follow-up.
|
||||||
The smoke server runs as a transient `systemd --user` unit named
|
|
||||||
`restic-manager-smoke.service` so it survives any sandbox or
|
|
||||||
process-group boundary that would otherwise SIGTERM a backgrounded
|
|
||||||
process. Use the Make targets:
|
|
||||||
|
|
||||||
```
|
|
||||||
make smoke-restart # rebuild server + (re)launch as systemd --user unit
|
|
||||||
make smoke-status # systemctl --user status
|
|
||||||
make smoke-logs # tail $HOME/smoke/server.log
|
|
||||||
make smoke-stop # stop the unit
|
|
||||||
make smoke-deploy # full rebuild + restage agent assets + restart
|
|
||||||
```
|
|
||||||
|
|
||||||
`./bin/restic-manager-server &` from inside a Bash tool call gets
|
|
||||||
reaped when the tool exits — don't do that. If the unit fails to
|
|
||||||
start: `systemctl --user status restic-manager-smoke` and
|
|
||||||
`$HOME/smoke/server.log` have the diagnosis.
|
|
||||||
|
|
||||||
`smoke-deploy` does NOT touch `/usr/local/bin/restic-manager-agent`
|
|
||||||
on this dev box; if your change requires the live agent here to
|
|
||||||
update, run the agent restage block above by hand.
|
|
||||||
|
|
||||||
## Migrations: prefer column-level ALTERs over table rebuilds
|
## Migrations: prefer column-level ALTERs over table rebuilds
|
||||||
|
|
||||||
|
|||||||
@@ -1,69 +0,0 @@
|
|||||||
# Code of Conduct
|
|
||||||
|
|
||||||
restic-manager is a small project run by one person. This Code of
|
|
||||||
Conduct sets out the basic expectations for participating in the
|
|
||||||
project's issue tracker, pull requests, and any other community
|
|
||||||
spaces (chat, mailing lists) we may run in future.
|
|
||||||
|
|
||||||
## Expected behaviour
|
|
||||||
|
|
||||||
- **Be civil.** Disagreement is fine; rudeness is not. The same
|
|
||||||
comment can usually be made without making it personal.
|
|
||||||
- **Assume good faith.** People asking what feels like a basic
|
|
||||||
question may be new to the project. People proposing what feels
|
|
||||||
like a duplicate idea may not have seen the prior discussion.
|
|
||||||
Point them to the right place politely.
|
|
||||||
- **Stay on topic.** Issue threads are for the issue. Tangential
|
|
||||||
conversations belong in their own thread.
|
|
||||||
- **Acknowledge the project's scope.** restic-manager is
|
|
||||||
intentionally small in scope (see `spec.md` §2). Reasonable
|
|
||||||
feature suggestions may still be declined for fit reasons.
|
|
||||||
|
|
||||||
## Unacceptable behaviour
|
|
||||||
|
|
||||||
- Harassment, threats, or insults — public or private.
|
|
||||||
- Discriminatory comments based on age, body size, disability,
|
|
||||||
ethnicity, gender identity or expression, level of experience,
|
|
||||||
nationality, personal appearance, race, religion, sexual identity
|
|
||||||
or orientation.
|
|
||||||
- Sustained disruption — derailing threads, ignoring repeated
|
|
||||||
requests to take a discussion elsewhere, brigading.
|
|
||||||
- Publishing other people's private information without permission.
|
|
||||||
|
|
||||||
## Reporting
|
|
||||||
|
|
||||||
If someone in the project's spaces is behaving in a way that
|
|
||||||
breaches this Code of Conduct, contact the maintainer directly
|
|
||||||
through the contact details on their Gitea profile, or via the
|
|
||||||
private security disclosure path documented in
|
|
||||||
[SECURITY.md](./SECURITY.md). Reports stay confidential.
|
|
||||||
|
|
||||||
The maintainer will review the report, gather context if needed,
|
|
||||||
and respond. Possible outcomes include a private warning, a public
|
|
||||||
clarification of expectations, a temporary or permanent ban from
|
|
||||||
project spaces, or no action if the report doesn't hold up.
|
|
||||||
|
|
||||||
There is no formal appeals process — this is a one-person project,
|
|
||||||
not a foundation. If you think a decision was wrong you can say
|
|
||||||
so, in writing, to the maintainer; that's it.
|
|
||||||
|
|
||||||
## Scope
|
|
||||||
|
|
||||||
This Code of Conduct applies to interactions in any space the
|
|
||||||
project owns or operates: the Gitea repository (issues, pull
|
|
||||||
requests, discussions, wiki), any chat channels we publish, and
|
|
||||||
any conferences or events the project is officially represented at.
|
|
||||||
|
|
||||||
It does not apply to:
|
|
||||||
|
|
||||||
- Forks of the project that aren't being submitted back upstream.
|
|
||||||
- Conversations between contributors that don't reference the
|
|
||||||
project.
|
|
||||||
- Public criticism of the project itself.
|
|
||||||
|
|
||||||
## Acknowledgement
|
|
||||||
|
|
||||||
This document borrows shape and language from the
|
|
||||||
[Contributor Covenant](https://www.contributor-covenant.org/) v2.1
|
|
||||||
but is intentionally shorter and adapted to the project's
|
|
||||||
single-maintainer reality.
|
|
||||||
+21
-159
@@ -1,168 +1,30 @@
|
|||||||
# Contributing to restic-manager
|
# Contributing
|
||||||
|
|
||||||
Thanks for your interest in restic-manager. This document covers how
|
Thanks for your interest in contributing to restic-manager.
|
||||||
to set up a development environment, the conventions the project
|
|
||||||
follows, and how patches make it from your machine into `main`.
|
|
||||||
|
|
||||||
## Project status and scope
|
> This is a placeholder. The project is in pre-alpha (Phase 1 / MVP). A
|
||||||
|
> full contributor guide will land alongside the Phase 5 OSS-readiness
|
||||||
|
> work — see [`tasks.md`](./tasks.md) P5-02. Until then the notes below
|
||||||
|
> apply.
|
||||||
|
|
||||||
restic-manager is in pre-1.0. Core functionality (Phases 0–4) is
|
## Before opening a PR
|
||||||
landed; OSS-readiness polish is in progress. The top of
|
|
||||||
[`tasks.md`](./tasks.md) tracks what's next; [`spec.md`](./spec.md)
|
|
||||||
is the canonical design doc and the source of truth for any
|
|
||||||
"why is it built this way" question.
|
|
||||||
|
|
||||||
The project is **single-maintainer, hobbyist-scale, and licensed
|
1. Open an issue first for non-trivial changes — the design is still
|
||||||
under [PolyForm Noncommercial 1.0.0](./LICENSE)**. That has two
|
moving (see [`spec.md`](./spec.md)) and unsolicited large PRs may
|
||||||
practical implications:
|
conflict with in-flight work.
|
||||||
|
2. `make lint test` should pass.
|
||||||
|
3. Match the existing code style — `gofumpt`, `goimports`, no comments
|
||||||
|
that just restate what the code does.
|
||||||
|
4. Keep commits focused; one logical change per commit.
|
||||||
|
|
||||||
1. Big PRs without prior discussion may be declined for fit
|
## Reporting security issues
|
||||||
reasons even when they're correct — opening an issue first lets
|
|
||||||
us check alignment cheaply.
|
|
||||||
2. Commercial use is not permitted by the license. Bug reports and
|
|
||||||
patches from operators of personal/community deployments are
|
|
||||||
very welcome.
|
|
||||||
|
|
||||||
## Getting started
|
Please do **not** open a public issue for security problems. A
|
||||||
|
`SECURITY.md` with a private disclosure path will be added in Phase 5
|
||||||
### Prerequisites
|
(P5-05). Until then, contact the repository owner directly via the
|
||||||
|
contact details on their Gitea profile.
|
||||||
- Go 1.25 or newer (`go.mod` is the source of truth)
|
|
||||||
- `make`
|
|
||||||
- For the front-end CSS bundle: nothing extra — `make build`
|
|
||||||
downloads a pinned `tailwindcss` standalone binary into `bin/`.
|
|
||||||
- For the docs site: nothing extra — `make docs` does the same trick
|
|
||||||
with `mdbook`.
|
|
||||||
- For end-to-end tests: Docker + Docker Compose, plus `npx` for
|
|
||||||
Playwright.
|
|
||||||
|
|
||||||
### One-time setup
|
|
||||||
|
|
||||||
```sh
|
|
||||||
git clone https://gitea.dcglab.co.uk/steve/restic-manager.git
|
|
||||||
cd restic-manager
|
|
||||||
make build # compiles bin/restic-manager-{server,agent}
|
|
||||||
make test # full unit + integration test sweep
|
|
||||||
make lint # gofumpt + goimports + golangci-lint
|
|
||||||
```
|
|
||||||
|
|
||||||
### Running locally
|
|
||||||
|
|
||||||
For most development, the [smoke environment](./docs/e2e-smoke.md)
|
|
||||||
is the path of least resistance:
|
|
||||||
|
|
||||||
```sh
|
|
||||||
make smoke-restart # rebuilds, launches as a systemd --user unit
|
|
||||||
make smoke-logs # tail of the server log
|
|
||||||
```
|
|
||||||
|
|
||||||
Then point a browser at `http://127.0.0.1:8080`. The first run
|
|
||||||
prints a one-time bootstrap token to the log; use it to create the
|
|
||||||
admin user.
|
|
||||||
|
|
||||||
## Code conventions
|
|
||||||
|
|
||||||
### Style
|
|
||||||
|
|
||||||
- `gofumpt` for formatting; `goimports` for import grouping.
|
|
||||||
Both run via the pre-commit hook in this repo.
|
|
||||||
- `golangci-lint` with `.golangci.yml` defaults; CI rejects on lint
|
|
||||||
errors.
|
|
||||||
- UK English in identifiers, comments, log messages, and UI strings
|
|
||||||
(the misspell linter is configured for the UK locale — see
|
|
||||||
P3-X5 for the original sweep).
|
|
||||||
- Comments explain **why**, not what; avoid restating the code.
|
|
||||||
A surprising invariant or an external constraint is worth
|
|
||||||
writing down. "Adds 1 to x" is not.
|
|
||||||
- `slog` for structured logs. Never log secrets — and especially
|
|
||||||
never the merged-creds rest-server URL (see [`CLAUDE.md`](./CLAUDE.md)).
|
|
||||||
|
|
||||||
### File and package layout
|
|
||||||
|
|
||||||
- `cmd/server` and `cmd/agent` are the two binary entry points.
|
|
||||||
- `internal/` holds everything that's not part of the public Go
|
|
||||||
API (which is none of it — restic-manager isn't a library).
|
|
||||||
- Per-feature packages live under `internal/server/...` for the
|
|
||||||
control plane and `internal/agent/...` for the agent.
|
|
||||||
- `web/templates/` are HTML templates rendered with the standard
|
|
||||||
library; embedded via `web.FS`.
|
|
||||||
|
|
||||||
### Tests
|
|
||||||
|
|
||||||
- Unit tests live alongside the code as `*_test.go`. Use the
|
|
||||||
in-process sqlite store (`store.Open(":memory:")`) when you need
|
|
||||||
state — there is no test mock layer to maintain.
|
|
||||||
- HTTP handlers test through `httptest.NewServer` against the real
|
|
||||||
router; see `internal/server/http/auth_test.go` for the canonical
|
|
||||||
fixture pattern.
|
|
||||||
- End-to-end tests live in `e2e/` and run against a Docker Compose
|
|
||||||
stack. See [`docs/e2e.md`](./docs/e2e.md).
|
|
||||||
|
|
||||||
### Database migrations
|
|
||||||
|
|
||||||
- Migrations are hand-rolled SQL in `internal/store/migrations/`
|
|
||||||
and embedded via `embed.FS`.
|
|
||||||
- Prefer column-level `ALTER TABLE` over rebuilds — see
|
|
||||||
[`CLAUDE.md`](./CLAUDE.md) "Migrations" section for the FK-cascade
|
|
||||||
trap that bit migration 0007's first draft.
|
|
||||||
|
|
||||||
## Workflow
|
|
||||||
|
|
||||||
### Before opening a PR
|
|
||||||
|
|
||||||
1. **Open an issue first** for non-trivial changes. The design is
|
|
||||||
still moving; an issue lets us agree on direction cheaply.
|
|
||||||
2. Run `make lint test` locally — both must pass.
|
|
||||||
3. Match existing code style (see above).
|
|
||||||
4. Keep commits focused: one logical change per commit. Imperative
|
|
||||||
subject lines, body explaining why if it isn't obvious.
|
|
||||||
5. Don't add `Co-Authored-By` trailers — repo policy. If you used
|
|
||||||
AI assistance in writing the patch, that's fine; we just don't
|
|
||||||
pollute every commit message with attribution boilerplate.
|
|
||||||
|
|
||||||
### Pull requests
|
|
||||||
|
|
||||||
PRs target `main`. CI runs lint + tests on Linux amd64/arm64 and
|
|
||||||
Windows amd64; all three must be green to merge. Squash-merge is
|
|
||||||
the default; the PR title becomes the merge-commit subject, so
|
|
||||||
keep it short and informative.
|
|
||||||
|
|
||||||
The PR template asks for:
|
|
||||||
|
|
||||||
- A short description of what changed and why.
|
|
||||||
- A test plan (commands run, scenarios verified).
|
|
||||||
- Anything reviewers need to know to assess the change (related
|
|
||||||
issue, follow-up work, deferred concerns).
|
|
||||||
|
|
||||||
### Reporting bugs
|
|
||||||
|
|
||||||
Open an issue with:
|
|
||||||
|
|
||||||
- restic-manager server version (`restic-manager-server --version`) and agent version (`restic-manager-agent --version`).
|
|
||||||
- restic version on the affected host.
|
|
||||||
- Steps to reproduce.
|
|
||||||
- Server and agent logs (sanitise any tokens before pasting).
|
|
||||||
|
|
||||||
Security-sensitive bugs go through the [SECURITY.md](./SECURITY.md)
|
|
||||||
disclosure path instead — please don't open a public issue for
|
|
||||||
them.
|
|
||||||
|
|
||||||
### Suggesting features
|
|
||||||
|
|
||||||
Open an issue describing the use case (not just the proposed
|
|
||||||
solution). The roadmap in `tasks.md` shows where the project is
|
|
||||||
heading; if the suggestion fits a future phase we'll wire it in
|
|
||||||
there. If it falls outside the project's scope (multi-tenancy, SaaS,
|
|
||||||
non-restic backends — see `spec.md` §2 non-goals) we'll say so
|
|
||||||
early to save your time.
|
|
||||||
|
|
||||||
## Code of conduct
|
|
||||||
|
|
||||||
Project participation is governed by [CODE_OF_CONDUCT.md](./CODE_OF_CONDUCT.md).
|
|
||||||
The short version: be civil; assume good faith; harassment is not
|
|
||||||
tolerated.
|
|
||||||
|
|
||||||
## License
|
## License
|
||||||
|
|
||||||
By contributing you agree that your contributions are licensed
|
By contributing you agree that your contributions are licensed under
|
||||||
under the [PolyForm Noncommercial 1.0.0](./LICENSE) license.
|
the [PolyForm Noncommercial 1.0.0](./LICENSE) license.
|
||||||
|
|||||||
@@ -7,9 +7,7 @@ AGENT_BIN := $(BIN_DIR)/restic-manager-agent
|
|||||||
VERSION ?= $(shell git describe --tags --always --dirty 2>/dev/null || echo dev)
|
VERSION ?= $(shell git describe --tags --always --dirty 2>/dev/null || echo dev)
|
||||||
COMMIT ?= $(shell git rev-parse HEAD 2>/dev/null || echo none)
|
COMMIT ?= $(shell git rev-parse HEAD 2>/dev/null || echo none)
|
||||||
DATE ?= $(shell date -u +%Y-%m-%dT%H:%M:%SZ)
|
DATE ?= $(shell date -u +%Y-%m-%dT%H:%M:%SZ)
|
||||||
VERSION_PKG := gitea.dcglab.co.uk/steve/restic-manager/internal/version
|
LDFLAGS := -s -w -X main.version=$(VERSION) -X main.commit=$(COMMIT) -X main.date=$(DATE)
|
||||||
LDFLAGS := -s -w -X main.version=$(VERSION) -X main.commit=$(COMMIT) -X main.date=$(DATE) \
|
|
||||||
-X $(VERSION_PKG).Version=$(VERSION) -X $(VERSION_PKG).Commit=$(COMMIT)
|
|
||||||
GOFLAGS := -trimpath
|
GOFLAGS := -trimpath
|
||||||
DOCKER_IMAGE ?= gitea.dcglab.co.uk/steve/restic-manager
|
DOCKER_IMAGE ?= gitea.dcglab.co.uk/steve/restic-manager
|
||||||
DOCKER_TAG ?= dev
|
DOCKER_TAG ?= dev
|
||||||
@@ -24,29 +22,7 @@ TAILWIND_URL := https://github.com/tailwindlabs/tailwindcss/releases/downlo
|
|||||||
TAILWIND_INPUT := web/styles/input.css
|
TAILWIND_INPUT := web/styles/input.css
|
||||||
TAILWIND_OUTPUT := web/static/css/styles.css
|
TAILWIND_OUTPUT := web/static/css/styles.css
|
||||||
|
|
||||||
# mdBook for the docs site (P5-01). Single static binary, no
|
.PHONY: help build server agent test test-race lint fmt tidy clean run-server run-agent docker release tailwind tailwind-watch setup hooks
|
||||||
# Rust toolchain — same pattern as Tailwind.
|
|
||||||
MDBOOK_VERSION ?= v0.4.51
|
|
||||||
MDBOOK_OS := $(shell uname -s | tr A-Z a-z)
|
|
||||||
MDBOOK_TRIPLE := $(shell uname -m)-unknown-$(if $(filter darwin,$(MDBOOK_OS)),apple-darwin,linux-gnu)
|
|
||||||
MDBOOK_BIN := $(BIN_DIR)/mdbook
|
|
||||||
MDBOOK_TARBALL := mdbook-$(MDBOOK_VERSION)-$(MDBOOK_TRIPLE).tar.gz
|
|
||||||
MDBOOK_URL := https://github.com/rust-lang/mdBook/releases/download/$(MDBOOK_VERSION)/$(MDBOOK_TARBALL)
|
|
||||||
DOCS_BOOK_DIR := docs/book
|
|
||||||
DOCS_BOOK_OUT := $(DOCS_BOOK_DIR)/book
|
|
||||||
|
|
||||||
.PHONY: help build server agent test test-race lint fmt tidy clean run-server run-agent docker release tailwind tailwind-watch docs docs-watch setup hooks smoke-restart smoke-stop smoke-status smoke-logs smoke-deploy
|
|
||||||
|
|
||||||
# ---- smoke-env tooling -------------------------------------------------
|
|
||||||
# The smoke server runs as a transient user-systemd unit so it survives
|
|
||||||
# bash-tool boundaries and reboots-of-the-shell. Use `make smoke-restart`
|
|
||||||
# any time you've rebuilt the server. `make smoke-deploy` is the full
|
|
||||||
# rebuild + restage + restart workflow described in CLAUDE.md.
|
|
||||||
SMOKE_UNIT := restic-manager-smoke
|
|
||||||
SMOKE_DATA_DIR := $(HOME)/smoke/data
|
|
||||||
SMOKE_LOG_FILE := $(HOME)/smoke/server.log
|
|
||||||
SMOKE_BASE_URL := http://127.0.0.1:8080
|
|
||||||
SMOKE_LISTEN := :8080
|
|
||||||
|
|
||||||
help:
|
help:
|
||||||
@grep -E '^[a-zA-Z_-]+:.*?## ' $(MAKEFILE_LIST) | awk 'BEGIN{FS=":.*?## "};{printf " \033[36m%-14s\033[0m %s\n",$$1,$$2}'
|
@grep -E '^[a-zA-Z_-]+:.*?## ' $(MAKEFILE_LIST) | awk 'BEGIN{FS=":.*?## "};{printf " \033[36m%-14s\033[0m %s\n",$$1,$$2}'
|
||||||
@@ -71,18 +47,6 @@ tailwind-watch: $(TAILWIND_BIN) ## Watch and rebuild on every save
|
|||||||
@mkdir -p $$(dirname $(TAILWIND_OUTPUT))
|
@mkdir -p $$(dirname $(TAILWIND_OUTPUT))
|
||||||
$(TAILWIND_BIN) -c tailwind.config.js -i $(TAILWIND_INPUT) -o $(TAILWIND_OUTPUT) --watch
|
$(TAILWIND_BIN) -c tailwind.config.js -i $(TAILWIND_INPUT) -o $(TAILWIND_OUTPUT) --watch
|
||||||
|
|
||||||
$(MDBOOK_BIN):
|
|
||||||
@mkdir -p $(BIN_DIR)
|
|
||||||
@echo "==> downloading mdbook $(MDBOOK_VERSION) ($(MDBOOK_TRIPLE))"
|
|
||||||
curl -fsSL "$(MDBOOK_URL)" | tar -xz -C $(BIN_DIR) mdbook
|
|
||||||
@chmod +x $@
|
|
||||||
|
|
||||||
docs: $(MDBOOK_BIN) ## Build the docs/book/ mdBook site into docs/book/book/
|
|
||||||
$(MDBOOK_BIN) build $(DOCS_BOOK_DIR)
|
|
||||||
|
|
||||||
docs-watch: $(MDBOOK_BIN) ## Serve the docs site at http://127.0.0.1:3000 with live reload
|
|
||||||
$(MDBOOK_BIN) serve $(DOCS_BOOK_DIR) -n 127.0.0.1 -p 3000
|
|
||||||
|
|
||||||
agent: ## Build the agent binary
|
agent: ## Build the agent binary
|
||||||
@mkdir -p $(BIN_DIR)
|
@mkdir -p $(BIN_DIR)
|
||||||
CGO_ENABLED=0 go build $(GOFLAGS) -ldflags "$(LDFLAGS)" -o $(AGENT_BIN) ./cmd/agent
|
CGO_ENABLED=0 go build $(GOFLAGS) -ldflags "$(LDFLAGS)" -o $(AGENT_BIN) ./cmd/agent
|
||||||
@@ -113,7 +77,7 @@ tidy: ## go mod tidy
|
|||||||
go mod tidy
|
go mod tidy
|
||||||
|
|
||||||
clean: ## Remove build artifacts
|
clean: ## Remove build artifacts
|
||||||
rm -rf $(BIN_DIR) coverage.out coverage.html $(TAILWIND_OUTPUT) $(DOCS_BOOK_OUT)
|
rm -rf $(BIN_DIR) coverage.out coverage.html $(TAILWIND_OUTPUT)
|
||||||
|
|
||||||
run-server: server ## Build and run the server
|
run-server: server ## Build and run the server
|
||||||
$(SERVER_BIN)
|
$(SERVER_BIN)
|
||||||
@@ -128,48 +92,6 @@ docker: ## Build the server Docker image
|
|||||||
--build-arg DATE=$(DATE) \
|
--build-arg DATE=$(DATE) \
|
||||||
-t $(DOCKER_IMAGE):$(DOCKER_TAG) .
|
-t $(DOCKER_IMAGE):$(DOCKER_TAG) .
|
||||||
|
|
||||||
smoke-restart: server ## (Re)start the smoke server as a transient user-systemd unit
|
|
||||||
@systemctl --user reset-failed $(SMOKE_UNIT) >/dev/null 2>&1 || true
|
|
||||||
@systemctl --user stop $(SMOKE_UNIT) >/dev/null 2>&1 || true
|
|
||||||
@echo "==> launching $(SMOKE_UNIT)"
|
|
||||||
systemd-run --user --unit=$(SMOKE_UNIT) \
|
|
||||||
--setenv=RM_LISTEN=$(SMOKE_LISTEN) \
|
|
||||||
--setenv=RM_DATA_DIR=$(SMOKE_DATA_DIR) \
|
|
||||||
--setenv=RM_BASE_URL=$(SMOKE_BASE_URL) \
|
|
||||||
--setenv=RM_SECRET_KEY_FILE=$(SMOKE_DATA_DIR)/secret.key \
|
|
||||||
--setenv=RM_COOKIE_SECURE=false \
|
|
||||||
--property=StandardOutput=append:$(SMOKE_LOG_FILE) \
|
|
||||||
--property=StandardError=append:$(SMOKE_LOG_FILE) \
|
|
||||||
--property=Restart=on-failure \
|
|
||||||
$(PWD)/$(SERVER_BIN)
|
|
||||||
@for i in 1 2 3 4 5; do \
|
|
||||||
curl -fsS -o /dev/null $(SMOKE_BASE_URL)/api/version 2>/dev/null && \
|
|
||||||
{ echo "==> smoke server up: $$(curl -s $(SMOKE_BASE_URL)/api/version)"; exit 0; }; \
|
|
||||||
sleep 1; \
|
|
||||||
done; \
|
|
||||||
echo "!! smoke server did not respond on $(SMOKE_BASE_URL) — check $(SMOKE_LOG_FILE)" >&2; \
|
|
||||||
systemctl --user status --no-pager $(SMOKE_UNIT) || true; \
|
|
||||||
exit 1
|
|
||||||
|
|
||||||
smoke-stop: ## Stop the smoke server
|
|
||||||
systemctl --user stop $(SMOKE_UNIT) || true
|
|
||||||
@systemctl --user reset-failed $(SMOKE_UNIT) >/dev/null 2>&1 || true
|
|
||||||
|
|
||||||
smoke-status: ## Show status of the smoke server
|
|
||||||
@systemctl --user status --no-pager $(SMOKE_UNIT) 2>&1 | head -20 || true
|
|
||||||
|
|
||||||
smoke-logs: ## Tail the smoke server log
|
|
||||||
tail -50 $(SMOKE_LOG_FILE)
|
|
||||||
|
|
||||||
smoke-deploy: build smoke-restart ## Rebuild + restage agent into smoke + restart server (full per-CLAUDE.md cycle)
|
|
||||||
@echo "==> restaging agent + install assets into $(SMOKE_DATA_DIR)"
|
|
||||||
cp $(AGENT_BIN) $(SMOKE_DATA_DIR)/agent-binaries/restic-manager-agent-linux-amd64
|
|
||||||
cp deploy/install/install.sh $(SMOKE_DATA_DIR)/install/install.sh
|
|
||||||
cp deploy/install/install.ps1 $(SMOKE_DATA_DIR)/install/install.ps1
|
|
||||||
cp deploy/install/restic-manager-agent.service $(SMOKE_DATA_DIR)/install/restic-manager-agent.service
|
|
||||||
@echo "==> NOTE: this dev box's installed agent at /usr/local/bin/restic-manager-agent is NOT updated by this target."
|
|
||||||
@echo " Run the agent restage block in CLAUDE.md if your change touches agent code or the unit file."
|
|
||||||
|
|
||||||
release: ## Cross-compile for all supported platforms
|
release: ## Cross-compile for all supported platforms
|
||||||
@mkdir -p $(BIN_DIR)
|
@mkdir -p $(BIN_DIR)
|
||||||
@for target in linux/amd64 linux/arm64 windows/amd64; do \
|
@for target in linux/amd64 linux/arm64 windows/amd64; do \
|
||||||
|
|||||||
@@ -1,62 +1,36 @@
|
|||||||
# restic-manager
|
# restic-manager
|
||||||
|
|
||||||
Self-hosted, browser-based, single-pane-of-glass for managing
|
Self-hosted, browser-based, single-pane-of-glass for managing
|
||||||
[restic](https://restic.net) backups across a fleet of Linux and
|
[restic](https://restic.net) backups across a fleet of Linux and Windows
|
||||||
Windows endpoints.
|
endpoints.
|
||||||
|
|
||||||
> **Status:** pre-1.0, feature-complete for the original use
|
> Status: pre-alpha. Phase 0 (project bootstrap) complete; Phase 1 (MVP) in
|
||||||
> case. Phases 0–4 + 6 are landed (MVP, scheduling, restore,
|
> progress. See [`spec.md`](./spec.md) for the design and
|
||||||
> RBAC + OIDC, observability); Phase 5 (OSS readiness — docs site,
|
> [`tasks.md`](./tasks.md) for the roadmap.
|
||||||
> contributor onboarding, end-to-end CI) is in flight. See
|
|
||||||
> [`spec.md`](./spec.md) for the design and [`tasks.md`](./tasks.md)
|
|
||||||
> for the live roadmap.
|
|
||||||
|
|
||||||
## What it does
|
## What it does (target)
|
||||||
|
|
||||||
- Central visibility into backup state for every endpoint.
|
- Central visibility into backup state for every endpoint
|
||||||
- Trigger any restic operation remotely (`backup`, `forget`,
|
- Trigger any restic operation remotely (`backup`, `forget`, `prune`,
|
||||||
`prune`, `check`, `unlock`, `snapshots`, `stats`, `diff`,
|
`check`, `unlock`, `snapshots`, `stats`, `diff`, `restore`)
|
||||||
`restore`).
|
- Manage per-host backup schedules from the UI
|
||||||
- Per-host schedules with named source groups + retention.
|
- Live job progress streamed back to the UI
|
||||||
- Live job log streamed to the browser; downloadable as
|
- Restore wizard (browse snapshots, pick paths, restore to original or
|
||||||
text/NDJSON afterwards.
|
alternate host)
|
||||||
- Restore wizard: browse a snapshot's tree, pick paths, restore
|
- Repo health surfacing (size, dedup ratio, last check, lock state)
|
||||||
in-place or to a new directory.
|
- Alerting on failure or staleness
|
||||||
- Repo health surfacing (size, raw size, last check, lock state),
|
- Cross-platform agent (Linux + Windows)
|
||||||
plus a 30/90-day repo-size trend.
|
- Ransomware-resistant repo access via append-only credentials
|
||||||
- Alerting over webhook, ntfy, or SMTP.
|
|
||||||
- Cross-platform agent (Linux systemd + Windows SCM).
|
|
||||||
- Append-only-friendly: separate admin credential for prune.
|
|
||||||
- Optional Prometheus `/metrics` endpoint + sample Grafana
|
|
||||||
dashboard.
|
|
||||||
- Optional OIDC SSO (Authelia, Authentik, etc.).
|
|
||||||
|
|
||||||
## Screenshots
|
## Architecture (one-line summary)
|
||||||
|
|
||||||
| Sign in | Empty dashboard | Add host |
|
A small Go control-plane on the Proxmox host, lightweight Go agents on each
|
||||||
|:-------:|:---------------:|:--------:|
|
endpoint that hold an outbound WebSocket to the control-plane, and a
|
||||||
|  |  |  |
|
`restic/rest-server` on Unraid that holds the actual backup data. The
|
||||||
|
control-plane never touches backup bytes.
|
||||||
| Alerts | Settings | Audit log |
|
|
||||||
|:------:|:--------:|:---------:|
|
|
||||||
|  |  |  |
|
|
||||||
|
|
||||||
(Screenshots from a fresh smoke install with no hosts. A populated
|
|
||||||
fleet view and the live-log + restore wizard surfaces are part of
|
|
||||||
the docs site under [`docs/book/`](./docs/book) — `make docs` to
|
|
||||||
render locally.)
|
|
||||||
|
|
||||||
## Architecture (one-line)
|
|
||||||
|
|
||||||
A small Go control-plane in Docker, lightweight Go agents on each
|
|
||||||
endpoint holding an outbound WebSocket to the control-plane, and
|
|
||||||
a restic repository (rest-server, S3, B2, SFTP — anything restic
|
|
||||||
speaks) that holds the actual backup data. **The control-plane
|
|
||||||
never touches backup bytes.**
|
|
||||||
|
|
||||||
Full architecture diagram and component breakdown:
|
Full architecture diagram and component breakdown:
|
||||||
[`spec.md` §3](./spec.md), or the rendered version in the
|
[`spec.md` §3](./spec.md).
|
||||||
[docs site](./docs/book/src/concepts/architecture.md).
|
|
||||||
|
|
||||||
## Repository layout
|
## Repository layout
|
||||||
|
|
||||||
@@ -64,63 +38,31 @@ Full architecture diagram and component breakdown:
|
|||||||
cmd/server/ control-plane binary
|
cmd/server/ control-plane binary
|
||||||
cmd/agent/ endpoint agent binary
|
cmd/agent/ endpoint agent binary
|
||||||
internal/api shared API types (REST + WS envelopes)
|
internal/api shared API types (REST + WS envelopes)
|
||||||
internal/server/ HTTP, WS, UI handlers, alert engine
|
internal/server/ HTTP, WS, UI handlers
|
||||||
internal/agent/ service integration, restic runner, local scheduler
|
internal/agent/ service integration, restic runner, local scheduler
|
||||||
internal/restic restic CLI wrapper
|
internal/restic restic CLI wrapper
|
||||||
internal/store SQLite persistence
|
internal/store SQLite persistence
|
||||||
internal/crypto secret encryption (AEAD)
|
internal/crypto secret encryption
|
||||||
internal/auth passwords, sessions, agent tokens
|
internal/auth passwords, sessions, agent tokens
|
||||||
web/ server-rendered templates + static assets
|
web/ server-rendered templates + static assets
|
||||||
deploy/ Dockerfile, docker-compose.yml, install scripts, Grafana dashboard
|
deploy/ Dockerfile, docker-compose.yml, install scripts
|
||||||
docs/ prose docs + the mdBook site under docs/book
|
design/ UI wireframes (Phase 0 design pass)
|
||||||
e2e/ compose stack + Playwright tests for end-to-end CI
|
|
||||||
```
|
```
|
||||||
|
|
||||||
## Quickstart
|
|
||||||
|
|
||||||
The reference deployment is a single Docker container fronted by
|
|
||||||
your existing reverse proxy. See the [installation guide](docs/book/src/getting-started/install.md)
|
|
||||||
for the full path; the very short version:
|
|
||||||
|
|
||||||
```sh
|
|
||||||
export RM_VERSION=v0.9.0 # pin a real tag
|
|
||||||
export RM_BASE_URL=https://restic.example.com
|
|
||||||
export RM_TRUSTED_PROXY=10.0.0.0/8
|
|
||||||
docker compose -f deploy/docker-compose.yml up -d
|
|
||||||
```
|
|
||||||
|
|
||||||
The server prints a one-time bootstrap token to the log on first
|
|
||||||
start. POST it to `/api/bootstrap` (or open `/bootstrap` in a
|
|
||||||
browser) to create the admin user.
|
|
||||||
|
|
||||||
## Local development
|
## Local development
|
||||||
|
|
||||||
Requires Go 1.25+. The floor is set by `modernc.org/sqlite` v1.50.
|
Requires Go 1.25+ (built and tested on 1.26). The floor is set by
|
||||||
|
`modernc.org/sqlite` v1.50.
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
make build # builds cmd/server and cmd/agent into ./bin
|
make build # builds cmd/server and cmd/agent into ./bin
|
||||||
make test # runs go test ./...
|
make test # runs go test ./...
|
||||||
make lint # runs golangci-lint
|
make lint # runs golangci-lint
|
||||||
make smoke-restart # systemd --user smoke server (see CLAUDE.md)
|
make run-server # runs the server (dev defaults)
|
||||||
make docs # renders the mdBook site to docs/book/book/
|
|
||||||
```
|
```
|
||||||
|
|
||||||
End-to-end test harness against a Docker Compose stack with a
|
|
||||||
sibling Linux agent: see [`docs/e2e.md`](docs/e2e.md). Runs in CI
|
|
||||||
on every PR.
|
|
||||||
|
|
||||||
## Documentation
|
|
||||||
|
|
||||||
- **Concepts and operator guides**: [docs site](docs/book/src/intro.md),
|
|
||||||
rendered with `make docs`.
|
|
||||||
- **Reverse-proxy setup**: [docs/reverse-proxy.md](docs/reverse-proxy.md).
|
|
||||||
- **Prometheus + Grafana**: [docs/prometheus.md](docs/prometheus.md).
|
|
||||||
- **End-to-end test harness**: [docs/e2e.md](docs/e2e.md).
|
|
||||||
- **Security policy**: [SECURITY.md](SECURITY.md).
|
|
||||||
- **Contributing**: [CONTRIBUTING.md](CONTRIBUTING.md).
|
|
||||||
|
|
||||||
## License
|
## License
|
||||||
|
|
||||||
[PolyForm Noncommercial 1.0.0](./LICENSE). Free for personal,
|
PolyForm Noncommercial 1.0.0 — see [`LICENSE`](./LICENSE). Free for personal,
|
||||||
hobby, research, educational, governmental, and other noncommercial
|
hobby, research, educational, governmental, and other noncommercial use.
|
||||||
use. Commercial use requires a separate license.
|
Commercial use requires a separate license.
|
||||||
|
|||||||
-137
@@ -1,137 +0,0 @@
|
|||||||
# Security policy
|
|
||||||
|
|
||||||
restic-manager handles credentials that grant access to backup
|
|
||||||
repositories — losing them means an attacker can read or destroy a
|
|
||||||
fleet's backups. We take security reports seriously even at this
|
|
||||||
project's small scale.
|
|
||||||
|
|
||||||
## Supported versions
|
|
||||||
|
|
||||||
Pre-1.0, only the latest tagged release on `main` is supported.
|
|
||||||
Backporting fixes to older tags is not currently offered.
|
|
||||||
|
|
||||||
| Version | Supported |
|
|
||||||
|--------------------|----------------|
|
|
||||||
| `main` HEAD | Yes |
|
|
||||||
| Latest released tag| Yes |
|
|
||||||
| Anything older | No |
|
|
||||||
|
|
||||||
## Reporting a vulnerability
|
|
||||||
|
|
||||||
**Please don't open a public issue for security problems.**
|
|
||||||
|
|
||||||
Instead, use one of these private channels:
|
|
||||||
|
|
||||||
1. **Gitea private message** to the repository owner. The
|
|
||||||
instance is at <https://gitea.dcglab.co.uk> and the owner's
|
|
||||||
profile (`steve`) has direct-message contact set up.
|
|
||||||
2. **Email** to the address on the maintainer's Gitea profile.
|
|
||||||
Use a subject like `[SECURITY] restic-manager: <one-line summary>`
|
|
||||||
so it doesn't get lost. PGP optional — if you want to encrypt,
|
|
||||||
ask for a key first.
|
|
||||||
|
|
||||||
If you don't get an acknowledgement within **3 working days**,
|
|
||||||
please escalate through the other channel — solo maintainers do
|
|
||||||
miss things, and the goal here is to fix the problem, not to
|
|
||||||
preserve protocol.
|
|
||||||
|
|
||||||
### What to include
|
|
||||||
|
|
||||||
- A description of the issue and the impact (what does an attacker
|
|
||||||
gain? confidentiality, integrity, availability?).
|
|
||||||
- Affected component (server, agent, install script, docs).
|
|
||||||
- Affected version (`restic-manager-server --version`).
|
|
||||||
- Reproduction steps if you have them. A working PoC is welcome
|
|
||||||
but not required — a credible threat model is enough.
|
|
||||||
- Whether you intend to publish a writeup, and any timing
|
|
||||||
preferences.
|
|
||||||
|
|
||||||
### What we'll do
|
|
||||||
|
|
||||||
1. Acknowledge receipt within 3 working days.
|
|
||||||
2. Confirm or refute the issue, and agree a rough severity (CVSS
|
|
||||||
or just "this is bad / this isn't"). Asking clarifying
|
|
||||||
questions is normal at this stage — please don't read it as
|
|
||||||
foot-dragging.
|
|
||||||
3. Develop a fix on a private branch, test it, and prepare a
|
|
||||||
release.
|
|
||||||
4. Coordinate disclosure timing with you. The default is **30
|
|
||||||
days from confirmed report to public disclosure**, with a
|
|
||||||
patched release published before the disclosure date. Faster
|
|
||||||
if a workable PoC is already circulating; slower only by
|
|
||||||
mutual agreement.
|
|
||||||
5. Credit the reporter in the release notes (or omit the credit
|
|
||||||
if you'd rather stay anonymous — your choice).
|
|
||||||
|
|
||||||
## Scope
|
|
||||||
|
|
||||||
In scope:
|
|
||||||
|
|
||||||
- The server binary (`cmd/server`) and any HTTP, WebSocket, or CLI
|
|
||||||
surface it exposes.
|
|
||||||
- The agent binary (`cmd/agent`) and the way it consumes commands
|
|
||||||
from the server.
|
|
||||||
- The install scripts (`deploy/install/install.sh`, `install.ps1`)
|
|
||||||
and the systemd unit shipped with them.
|
|
||||||
- The docker-compose reference deployment and the docker image we
|
|
||||||
publish.
|
|
||||||
- Any cryptographic primitive choice or implementation detail
|
|
||||||
(AEAD, token hashing, session handling, OIDC handshake).
|
|
||||||
- Documentation that, if followed, leads operators into an
|
|
||||||
insecure configuration.
|
|
||||||
|
|
||||||
Out of scope (not because they aren't real problems, just not ones
|
|
||||||
this report channel can act on):
|
|
||||||
|
|
||||||
- Vulnerabilities in restic itself — report those upstream at
|
|
||||||
<https://github.com/restic/restic>.
|
|
||||||
- Vulnerabilities in third-party dependencies that haven't yet been
|
|
||||||
patched upstream — report upstream first.
|
|
||||||
- Issues that require pre-authenticated admin access on the control
|
|
||||||
plane (admins can already do everything; that's not a privilege
|
|
||||||
escalation, that's the design).
|
|
||||||
- DoS via resource exhaustion on a deployment without the
|
|
||||||
recommended reverse proxy / rate limiting in front (see
|
|
||||||
`docs/reverse-proxy.md`).
|
|
||||||
- Social-engineering scenarios that don't have a technical hook
|
|
||||||
into the project's own surfaces.
|
|
||||||
|
|
||||||
## Threat model summary
|
|
||||||
|
|
||||||
For context (longer version in [`spec.md`](./spec.md) §11):
|
|
||||||
|
|
||||||
- The server is **HTTP-only**; TLS termination, ACME, HSTS, and
|
|
||||||
edge rate-limiting are the reverse proxy's job.
|
|
||||||
- Credentials are encrypted at rest with an AEAD key loaded from
|
|
||||||
`RM_SECRET_KEY_FILE`. The same key encrypts agent secrets that
|
|
||||||
travel to the agent over the WS channel.
|
|
||||||
- Agents authenticate with bearer tokens issued at enrolment and
|
|
||||||
hashed at rest. Compromise of the server DB does **not** leak
|
|
||||||
bearer tokens in plaintext, but does leak the hashes (which is
|
|
||||||
enough to log in *as* the agent until the operator revokes —
|
|
||||||
see [NS-01 / NS-02](./tasks.md) for the revoke + regenerate
|
|
||||||
flows).
|
|
||||||
- The control plane intentionally **never touches backup bytes** —
|
|
||||||
the agent runs `restic` directly against the repo. A
|
|
||||||
compromised control plane can dispatch new jobs but cannot
|
|
||||||
exfiltrate snapshot contents in-band.
|
|
||||||
- Append-only credentials are first-class. Forget/prune jobs use a
|
|
||||||
separate, admin-marked credential that the server only pushes
|
|
||||||
for the duration of a maintenance dispatch.
|
|
||||||
|
|
||||||
## Hardening checklist for operators
|
|
||||||
|
|
||||||
- Run behind a TLS-terminating reverse proxy (Caddy/nginx/Traefik).
|
|
||||||
- Set `RM_TRUSTED_PROXY` to the proxy's CIDR so request IPs aren't
|
|
||||||
spoofable.
|
|
||||||
- Back up `RM_SECRET_KEY_FILE` separately from the database.
|
|
||||||
Without it the encrypted creds are unrecoverable.
|
|
||||||
- Use append-only credentials for the everyday backup path; only
|
|
||||||
the optional admin credential should have write/forget/prune
|
|
||||||
power.
|
|
||||||
- Disable users (don't delete) when staff change roles — bearer
|
|
||||||
tokens stay valid until rotated.
|
|
||||||
- Watch the alert and audit-log views during enrolment of new
|
|
||||||
hosts.
|
|
||||||
|
|
||||||
Thanks for helping keep restic-manager users safe.
|
|
||||||
@@ -1,8 +0,0 @@
|
|||||||
# The ask!
|
|
||||||
|
|
||||||
I have numerous servers deployed out in a lab, mainly Linux but some Windows
|
|
||||||
All have restic installed on them
|
|
||||||
I need to build a browser-based management service that allows me to have a central single-pane-of-glass to monitor and manage all the endpoints
|
|
||||||
All endpoints will be enabled for SSH (unless other methods are better?)
|
|
||||||
|
|
||||||
Plan out how we would go about this please?
|
|
||||||
+4
-8
@@ -148,7 +148,6 @@ func run() error {
|
|||||||
resticBin: resticBin,
|
resticBin: resticBin,
|
||||||
resticVer: snap.ResticVersion,
|
resticVer: snap.ResticVersion,
|
||||||
resticSupportsNoOwnership: resticSupportsNoOwnership,
|
resticSupportsNoOwnership: resticSupportsNoOwnership,
|
||||||
serverURL: cfg.ServerURL,
|
|
||||||
secrets: sec,
|
secrets: sec,
|
||||||
scheduler: scheduler.New(),
|
scheduler: scheduler.New(),
|
||||||
}
|
}
|
||||||
@@ -215,7 +214,6 @@ type dispatcher struct {
|
|||||||
resticBin string
|
resticBin string
|
||||||
resticVer string // e.g. "0.17.1"; empty if restic isn't installed yet
|
resticVer string // e.g. "0.17.1"; empty if restic isn't installed yet
|
||||||
resticSupportsNoOwnership bool // captured at startup from `restic restore --help`
|
resticSupportsNoOwnership bool // captured at startup from `restic restore --help`
|
||||||
serverURL string // base URL of the server (used by the self-update fetch)
|
|
||||||
secrets *secrets.Store
|
secrets *secrets.Store
|
||||||
scheduler *scheduler.Scheduler
|
scheduler *scheduler.Scheduler
|
||||||
|
|
||||||
@@ -397,12 +395,10 @@ func (d *dispatcher) handle(ctx context.Context, env api.Envelope, tx wsclient.S
|
|||||||
"up_kbps", up, "down_kbps", down)
|
"up_kbps", up, "down_kbps", down)
|
||||||
}
|
}
|
||||||
|
|
||||||
case api.MsgCommandUpdate:
|
case api.MsgAgentUpdateAvail:
|
||||||
var p api.CommandUpdatePayload
|
var p api.AgentUpdateAvailablePayload
|
||||||
if err := env.UnmarshalPayload(&p); err != nil {
|
_ = env.UnmarshalPayload(&p)
|
||||||
return fmt.Errorf("command.update: %w", err)
|
slog.Info("ws agent: update available", "version", p.LatestVersion, "url", p.PackageURL)
|
||||||
}
|
|
||||||
go d.runUpdate(ctx, p, tx)
|
|
||||||
|
|
||||||
default:
|
default:
|
||||||
slog.Debug("ws agent: ignored message", "type", env.Type)
|
slog.Debug("ws agent: ignored message", "type", env.Type)
|
||||||
|
|||||||
@@ -1,65 +0,0 @@
|
|||||||
package main
|
|
||||||
|
|
||||||
import (
|
|
||||||
"context"
|
|
||||||
"fmt"
|
|
||||||
"log/slog"
|
|
||||||
"time"
|
|
||||||
|
|
||||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/agent/updater"
|
|
||||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/agent/wsclient"
|
|
||||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/api"
|
|
||||||
)
|
|
||||||
|
|
||||||
// runUpdate handles a server-dispatched command.update. It logs progress
|
|
||||||
// via log.stream so the live job page captures pre-restart state, then
|
|
||||||
// calls the platform updater. On Linux the updater calls os.Exit; on
|
|
||||||
// Windows it spawns a detached helper and returns, with the agent then
|
|
||||||
// exiting.
|
|
||||||
//
|
|
||||||
// The terminal job state is set by the server, not the agent: success
|
|
||||||
// is "agent re-hellos with matching version" rather than anything the
|
|
||||||
// agent itself can assert. The only `job.finished` we send from here is
|
|
||||||
// on the failure path, before any restart attempt.
|
|
||||||
func (d *dispatcher) runUpdate(ctx context.Context, p api.CommandUpdatePayload, tx wsclient.Sender) {
|
|
||||||
logf := func(format string, args ...any) {
|
|
||||||
line := fmt.Sprintf(format, args...)
|
|
||||||
slog.Info("ws agent: update: " + line)
|
|
||||||
env, err := api.Marshal(api.MsgLogStream, "", api.LogStreamLine{
|
|
||||||
JobID: p.JobID,
|
|
||||||
TS: time.Now().UTC(),
|
|
||||||
Stream: api.LogStdout,
|
|
||||||
Payload: line,
|
|
||||||
})
|
|
||||||
if err == nil {
|
|
||||||
_ = tx.Send(env)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
startedEnv, err := api.Marshal(api.MsgJobStarted, "", api.JobStartedPayload{
|
|
||||||
JobID: p.JobID,
|
|
||||||
Kind: api.JobUpdate,
|
|
||||||
StartedAt: time.Now().UTC(),
|
|
||||||
})
|
|
||||||
if err == nil {
|
|
||||||
_ = tx.Send(startedEnv)
|
|
||||||
}
|
|
||||||
|
|
||||||
logf("fetching new binary from %s", d.serverURL)
|
|
||||||
if err := updater.Update(ctx, d.serverURL); err != nil {
|
|
||||||
logf("update failed: %v", err)
|
|
||||||
finishedEnv, mErr := api.Marshal(api.MsgJobFinished, "", api.JobFinishedPayload{
|
|
||||||
JobID: p.JobID,
|
|
||||||
Status: api.JobFailed,
|
|
||||||
FinishedAt: time.Now().UTC(),
|
|
||||||
Error: err.Error(),
|
|
||||||
})
|
|
||||||
if mErr == nil {
|
|
||||||
_ = tx.Send(finishedEnv)
|
|
||||||
}
|
|
||||||
return
|
|
||||||
}
|
|
||||||
// Unreachable on Linux (Update calls os.Exit). On Windows control
|
|
||||||
// returns here while the detached helper does the swap-and-restart;
|
|
||||||
// the agent then exits cleanly so SCM hands off.
|
|
||||||
}
|
|
||||||
@@ -17,7 +17,6 @@ import (
|
|||||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/crypto"
|
"gitea.dcglab.co.uk/steve/restic-manager/internal/crypto"
|
||||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/notification"
|
"gitea.dcglab.co.uk/steve/restic-manager/internal/notification"
|
||||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/config"
|
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/config"
|
||||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/fleetupdate"
|
|
||||||
rmhttp "gitea.dcglab.co.uk/steve/restic-manager/internal/server/http"
|
rmhttp "gitea.dcglab.co.uk/steve/restic-manager/internal/server/http"
|
||||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/maintenance"
|
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/maintenance"
|
||||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/oidc"
|
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/oidc"
|
||||||
@@ -92,7 +91,6 @@ func run() error {
|
|||||||
|
|
||||||
notifHub := notification.NewHub(st, aead, cfg.BaseURL)
|
notifHub := notification.NewHub(st, aead, cfg.BaseURL)
|
||||||
alertEngine := alert.NewEngine(st, notifHub)
|
alertEngine := alert.NewEngine(st, notifHub)
|
||||||
updateWatcher := ws.NewUpdateWatcher(st, alertEngine, jobHub)
|
|
||||||
|
|
||||||
renderer, err := ui.New()
|
renderer, err := ui.New()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -118,7 +116,6 @@ func run() error {
|
|||||||
JobHub: jobHub,
|
JobHub: jobHub,
|
||||||
AlertEngine: alertEngine,
|
AlertEngine: alertEngine,
|
||||||
NotificationHub: notifHub,
|
NotificationHub: notifHub,
|
||||||
UpdateWatcher: updateWatcher,
|
|
||||||
UI: renderer,
|
UI: renderer,
|
||||||
Version: version,
|
Version: version,
|
||||||
OIDC: oidcClient,
|
OIDC: oidcClient,
|
||||||
@@ -150,17 +147,10 @@ func run() error {
|
|||||||
|
|
||||||
srv := rmhttp.New(deps)
|
srv := rmhttp.New(deps)
|
||||||
|
|
||||||
// Fleet-update worker — built after the HTTP server because the
|
|
||||||
// dispatcher delegates back into srv.DispatchHostUpdate.
|
|
||||||
fleetWorker := fleetupdate.NewWorker(st, hub,
|
|
||||||
&serverDispatcher{srv: srv}, alertEngine)
|
|
||||||
srv.SetFleetWorker(fleetWorker)
|
|
||||||
|
|
||||||
ctx, stop := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
|
ctx, stop := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
|
||||||
defer stop()
|
defer stop()
|
||||||
|
|
||||||
go alertEngine.Run(ctx)
|
go alertEngine.Run(ctx)
|
||||||
go updateWatcher.Run(ctx)
|
|
||||||
|
|
||||||
errCh := make(chan error, 1)
|
errCh := make(chan error, 1)
|
||||||
go func() {
|
go func() {
|
||||||
@@ -253,12 +243,3 @@ func run() error {
|
|||||||
}
|
}
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// serverDispatcher adapts the http.Server's DispatchHostUpdate method
|
|
||||||
// to the fleetupdate.Dispatcher interface. Lives in main so the
|
|
||||||
// http and fleetupdate packages don't need to know about each other.
|
|
||||||
type serverDispatcher struct{ srv *rmhttp.Server }
|
|
||||||
|
|
||||||
func (d *serverDispatcher) DispatchUpdate(ctx context.Context, hostID, actorUserID string) (string, string, error) {
|
|
||||||
return d.srv.DispatchHostUpdate(ctx, hostID, actorUserID)
|
|
||||||
}
|
|
||||||
|
|||||||
@@ -52,12 +52,7 @@ ProtectSystem=full
|
|||||||
# whenever a new SecretsKey is minted, so we need a targeted
|
# whenever a new SecretsKey is minted, so we need a targeted
|
||||||
# write-exemption for that dir. No exemption for the rest of /etc:
|
# write-exemption for that dir. No exemption for the rest of /etc:
|
||||||
# the agent has no business editing /etc/passwd, /etc/sudoers, etc.
|
# the agent has no business editing /etc/passwd, /etc/sudoers, etc.
|
||||||
#
|
ReadWritePaths=/etc/restic-manager
|
||||||
# /usr/local/bin is writable so the self-update flow (P6-01) can
|
|
||||||
# atomic-rename a fresh binary over the running one. Permitting the
|
|
||||||
# whole directory (rather than just the binary path) is required
|
|
||||||
# because os.Rename needs write access to the parent dir.
|
|
||||||
ReadWritePaths=/etc/restic-manager /usr/local/bin
|
|
||||||
ProtectHostname=true
|
ProtectHostname=true
|
||||||
ProtectKernelTunables=true
|
ProtectKernelTunables=true
|
||||||
ProtectKernelModules=true
|
ProtectKernelModules=true
|
||||||
|
|||||||
@@ -1,19 +0,0 @@
|
|||||||
[book]
|
|
||||||
title = "restic-manager"
|
|
||||||
description = "Self-hosted control plane for restic backups across a fleet of Linux and Windows endpoints."
|
|
||||||
authors = ["Steve Cliff"]
|
|
||||||
language = "en-GB"
|
|
||||||
multilingual = false
|
|
||||||
src = "src"
|
|
||||||
|
|
||||||
[output.html]
|
|
||||||
default-theme = "ayu"
|
|
||||||
preferred-dark-theme = "ayu"
|
|
||||||
git-repository-url = "https://gitea.dcglab.co.uk/steve/restic-manager"
|
|
||||||
git-repository-icon = "fa-code-fork"
|
|
||||||
edit-url-template = "https://gitea.dcglab.co.uk/steve/restic-manager/_edit/main/docs/book/{path}"
|
|
||||||
no-section-label = false
|
|
||||||
|
|
||||||
[output.html.fold]
|
|
||||||
enable = true
|
|
||||||
level = 2
|
|
||||||
@@ -1,40 +0,0 @@
|
|||||||
# Summary
|
|
||||||
|
|
||||||
[Introduction](./intro.md)
|
|
||||||
|
|
||||||
# Getting started
|
|
||||||
|
|
||||||
- [Installing the server](./getting-started/install.md)
|
|
||||||
- [Enrolling your first host](./getting-started/enrolling-hosts.md)
|
|
||||||
- [Running behind a reverse proxy](./getting-started/reverse-proxy.md)
|
|
||||||
|
|
||||||
# Concepts
|
|
||||||
|
|
||||||
- [Architecture](./concepts/architecture.md)
|
|
||||||
- [Credentials and how they flow](./concepts/credentials.md)
|
|
||||||
- [Schedules and source groups](./concepts/schedules-and-source-groups.md)
|
|
||||||
- [Repo maintenance](./concepts/repo-maintenance.md)
|
|
||||||
|
|
||||||
# Operations
|
|
||||||
|
|
||||||
- [Backups and restores](./operations/backups-and-restores.md)
|
|
||||||
- [Alerts and notifications](./operations/alerts.md)
|
|
||||||
- [Observability with Prometheus](./operations/observability.md)
|
|
||||||
- [Updating agents](./operations/updates.md)
|
|
||||||
|
|
||||||
# Security
|
|
||||||
|
|
||||||
- [Threat model](./security/threat-model.md)
|
|
||||||
- [Hardening checklist](./security/hardening.md)
|
|
||||||
- [Reporting vulnerabilities](./security/disclosure.md)
|
|
||||||
|
|
||||||
# Reference
|
|
||||||
|
|
||||||
- [Environment variables](./reference/env-vars.md)
|
|
||||||
- [HTTP endpoints](./reference/http-endpoints.md)
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
[Contributing](./contributing.md)
|
|
||||||
[Roadmap](./roadmap.md)
|
|
||||||
[License](./license.md)
|
|
||||||
@@ -1,121 +0,0 @@
|
|||||||
# Architecture
|
|
||||||
|
|
||||||
## Components
|
|
||||||
|
|
||||||
```
|
|
||||||
┌────────────────────────────────────────────────────────────┐
|
|
||||||
│ Server (control plane, single process) │
|
|
||||||
│ * chi-based HTTP API + HTMX server-rendered UI │
|
|
||||||
│ * WebSocket hub for agent fan-out + browser fan-out │
|
|
||||||
│ * SQLite store (modernc.org/sqlite, pure Go) │
|
|
||||||
│ * AEAD encryption helpers │
|
|
||||||
│ * Alert engine + notification hub │
|
|
||||||
└────────────┬───────────────────────────────────┬───────────┘
|
|
||||||
│ outbound WS only │ HTTP(S)
|
|
||||||
│ │
|
|
||||||
┌────────────▼─────────────┐ ┌────────────▼─────────────┐
|
|
||||||
│ Agent (per host) │ │ Browser (operator) │
|
|
||||||
│ * coder/websocket │ │ * htmx + a tiny bit │
|
|
||||||
│ * cron for schedules │ │ of vanilla JS for │
|
|
||||||
│ * restic wrapper │ │ live job updates │
|
|
||||||
│ * sysinfo collector │ └──────────────────────────┘
|
|
||||||
└────────────┬─────────────┘
|
|
||||||
│ subprocess: restic ...
|
|
||||||
│
|
|
||||||
┌────────────▼─────────────────────────────────────────────────┐
|
|
||||||
│ restic repository (rest-server, S3, B2, SFTP, local …) │
|
|
||||||
│ Backup data flows directly here. Server never touches it. │
|
|
||||||
└──────────────────────────────────────────────────────────────┘
|
|
||||||
```
|
|
||||||
|
|
||||||
## Why outbound-only WebSockets?
|
|
||||||
|
|
||||||
The agent dials the server on `/ws/agent` with a bearer token. The
|
|
||||||
server doesn't initiate connections to the agent. Three reasons:
|
|
||||||
|
|
||||||
1. **Firewall friendliness.** Nothing on the endpoint needs an
|
|
||||||
inbound port; this works behind the typical "branch office NAT"
|
|
||||||
without router config.
|
|
||||||
2. **Single auth point.** The bearer token is the only credential
|
|
||||||
that crosses the boundary; the agent never accepts an
|
|
||||||
incoming socket.
|
|
||||||
3. **Reconnect semantics are simpler.** When the connection drops
|
|
||||||
(NAT timeout, server restart, transient network glitch) the
|
|
||||||
agent backs off and re-dials; the server marks the host
|
|
||||||
offline after 90s and lets the alert engine raise a stale-host
|
|
||||||
alert.
|
|
||||||
|
|
||||||
## Why SQLite?
|
|
||||||
|
|
||||||
SQLite suits the project's stance on HA: it is an explicit non-goal. A small
|
|
||||||
control plane managing twelve endpoints does not need replication
|
|
||||||
or a separate database tier. SQLite gives us:
|
|
||||||
|
|
||||||
- A single file to back up (plus the secret key).
|
|
||||||
- Hand-rolled migrations under `internal/store/migrations/` —
|
|
||||||
no migration framework lock-in.
|
|
||||||
- `WAL` mode plus per-connection foreign-key enforcement.
|
|
||||||
|
|
||||||
The migrations define the entire schema; there's no ORM or
|
|
||||||
query-builder layer between Go code and SQL.
|
|
||||||
|
|
||||||
## Why the agent runs `restic` itself, not via the server
|
|
||||||
|
|
||||||
The control plane never holds backup bytes in flight. That's
|
|
||||||
deliberate:
|
|
||||||
|
|
||||||
- A compromised control plane cannot exfiltrate snapshot
|
|
||||||
contents in-band — at worst it can dispatch new backup or
|
|
||||||
forget jobs (audit-logged) but the data path is between the
|
|
||||||
agent and the repository.
|
|
||||||
- The same agent process can target whichever transport restic
|
|
||||||
natively supports (rest-server, S3, B2, SFTP, local), no
|
|
||||||
separate mux on the server side.
|
|
||||||
|
|
||||||
## Job lifecycle
|
|
||||||
|
|
||||||
```
|
|
||||||
┌──────────────────────┐
|
|
||||||
operator → │ POST /hosts/{id}/ │
|
|
||||||
│ run-backup │
|
|
||||||
└──────────┬───────────┘
|
|
||||||
│ 1. INSERT INTO jobs (status='queued')
|
|
||||||
│ 2. dispatch command.run over WS
|
|
||||||
▼
|
|
||||||
┌──────────────────────┐
|
|
||||||
│ Agent dispatches │
|
|
||||||
│ restic subprocess │
|
|
||||||
└──────────┬───────────┘
|
|
||||||
│
|
|
||||||
│ 3. job.started ───▶ store.MarkJobStarted
|
|
||||||
│ 4. job.progress ───▶ JobHub broadcast (live UI)
|
|
||||||
│ 5. log.stream ───▶ append to job_logs
|
|
||||||
│ 6. job.finished ───▶ store.MarkJobFinished
|
|
||||||
│ + alert engine eval
|
|
||||||
│ + (P6) metrics histogram
|
|
||||||
▼
|
|
||||||
terminal: succeeded | failed | cancelled
|
|
||||||
```
|
|
||||||
|
|
||||||
Operators see live updates because the browser subscribes to
|
|
||||||
`/api/jobs/{id}/stream`, and the WS handler broadcasts each
|
|
||||||
agent-emitted envelope to all live subscribers in addition to
|
|
||||||
persisting it.
|
|
||||||
|
|
||||||
## What scheduling looks like
|
|
||||||
|
|
||||||
- The agent runs a local `robfig/cron/v3` instance.
|
|
||||||
- The server pushes the desired schedule set to the agent on
|
|
||||||
hello + after every CRUD change.
|
|
||||||
- When the agent's cron fires, it sends `schedule.fire` to the
|
|
||||||
server. The server creates a job row, sends `command.run` back,
|
|
||||||
and the agent dispatches a normal backup.
|
|
||||||
- If the WS drops between fire and run, the server queues the
|
|
||||||
schedule firing into `pending_runs` and drains on agent
|
|
||||||
reconnect — no missed scheduled backups due to network blips.
|
|
||||||
|
|
||||||
For everything that isn't a backup (forget, prune, check), the
|
|
||||||
server runs a 60-second maintenance ticker against
|
|
||||||
`host_repo_maintenance` rows and dispatches the relevant command
|
|
||||||
when a cadence is due. The agent's local cron only handles
|
|
||||||
backups.
|
|
||||||
@@ -1,98 +0,0 @@
|
|||||||
# Credentials and how they flow
|
|
||||||
|
|
||||||
restic-manager handles three credential surfaces:
|
|
||||||
|
|
||||||
1. **Operator credentials** — the username + password (or OIDC
|
|
||||||
identity) that logs into the UI.
|
|
||||||
2. **Agent bearer tokens** — issued at enrolment, used by the
|
|
||||||
agent to authenticate its WebSocket to the server.
|
|
||||||
3. **Repo credentials** — the rest-server / S3 / B2 / SFTP
|
|
||||||
credentials the agent passes to `restic` itself.
|
|
||||||
|
|
||||||
Each has a different threat model and storage strategy.
|
|
||||||
|
|
||||||
## Operator credentials
|
|
||||||
|
|
||||||
- Local users are stored in `users` with a bcrypt password hash.
|
|
||||||
- Sessions are random tokens minted at login, stored hashed in
|
|
||||||
the `sessions` table, expired after 24h. Cookie is HttpOnly,
|
|
||||||
SameSite=Lax, and Secure (when `RM_COOKIE_SECURE=true`,
|
|
||||||
default).
|
|
||||||
- OIDC users carry `auth_source='oidc'` and an `oidc_subject`
|
|
||||||
pinning their IdP identity. Local password login is rejected
|
|
||||||
for OIDC users.
|
|
||||||
- Disabling a user soft-deletes them via `disabled_at` —
|
|
||||||
pre-existing sessions are invalidated on the next request.
|
|
||||||
|
|
||||||
## Agent bearer tokens
|
|
||||||
|
|
||||||
- Minted at enrolment, hashed at rest with `auth.HashToken`.
|
|
||||||
- The plaintext token only exists in memory at enrolment time
|
|
||||||
and on the agent's filesystem (`/etc/restic-manager/agent.yaml`,
|
|
||||||
mode `0600`, owned by the service user).
|
|
||||||
- Compromise of the server DB leaks the hashes, which is enough
|
|
||||||
to *log in as that agent* until you revoke. Compromise of the
|
|
||||||
agent host leaks the plaintext (via the config file) — same
|
|
||||||
end result.
|
|
||||||
- Rotation: re-enrol the host. Today there's no in-place rotate;
|
|
||||||
the operator deletes the host (which cascades, including
|
|
||||||
revoking the bearer hash) and re-runs the install command.
|
|
||||||
|
|
||||||
## Repo credentials
|
|
||||||
|
|
||||||
This is the credential that ultimately matters for backup
|
|
||||||
integrity. restic-manager keeps two slots per host:
|
|
||||||
|
|
||||||
- **The everyday credential** (`host_credentials.kind = ''`).
|
|
||||||
Append-only-friendly: this is the one your backup schedule
|
|
||||||
uses. It can write but not delete or forget.
|
|
||||||
- **The admin credential** (`host_credentials.kind = 'admin'`).
|
|
||||||
Has full delete rights. Only pushed to the agent transiently
|
|
||||||
while a `prune` or `forget` job is dispatching, and discarded
|
|
||||||
by the agent after the job ends.
|
|
||||||
|
|
||||||
### Encryption flow
|
|
||||||
|
|
||||||
1. Operator types the credential into the UI or the install form.
|
|
||||||
2. Server AEAD-encrypts the cred (`crypto.AEAD.Encrypt`) using the
|
|
||||||
key in `RM_SECRET_KEY_FILE`. The plaintext is dropped from
|
|
||||||
memory.
|
|
||||||
3. Encrypted blob is stored in `host_credentials.cred_blob`.
|
|
||||||
4. When the agent connects, the server decrypts the blob and
|
|
||||||
sends the **plaintext** down the WebSocket inside a
|
|
||||||
`config.update` envelope.
|
|
||||||
5. The agent stores the plaintext in its in-memory secrets store
|
|
||||||
for the lifetime of the process; it's reloaded fresh on every
|
|
||||||
server-side push.
|
|
||||||
6. When a job runs, the agent merges the credential into the
|
|
||||||
restic environment (`restic.Env.RepoURL` stays bare; the
|
|
||||||
`user:pass@…` form is built only inside `envSlice()` at the
|
|
||||||
moment of `exec.Command`).
|
|
||||||
|
|
||||||
The merged form is **never logged**. The slog package's structured
|
|
||||||
output gets `restic.RedactURL()` for any URL it has cause to
|
|
||||||
mention.
|
|
||||||
|
|
||||||
### Why push plaintext over the wire?
|
|
||||||
|
|
||||||
The transport itself is the trust boundary: the WebSocket runs
|
|
||||||
inside the same TLS-terminated reverse-proxy connection your
|
|
||||||
browser uses, and the agent has already authenticated with its
|
|
||||||
bearer token. Re-encrypting the payload on top of that would just
|
|
||||||
move the key-management problem somewhere else.
|
|
||||||
|
|
||||||
If your reverse proxy isn't TLS-terminated, the deployment is
|
|
||||||
already broken — see [Hardening](../security/hardening.md).
|
|
||||||
|
|
||||||
## Setup tokens (admin-driven)
|
|
||||||
|
|
||||||
When an admin creates a new user, the server mints a one-time
|
|
||||||
setup link valid for 1 hour. The hash is stored; the raw token
|
|
||||||
is shown to the admin once. The user opens the link, sets a
|
|
||||||
password, and is dropped into a session. Expired tokens are
|
|
||||||
swept on the alert engine's 60s tick.
|
|
||||||
|
|
||||||
Same pattern for enrolment tokens: the raw token only exists in
|
|
||||||
memory at mint time, and the install snippet is the operator's
|
|
||||||
only chance to capture it. If you lose it, regenerate via the
|
|
||||||
**Add host** page (NS-02).
|
|
||||||
@@ -1,85 +0,0 @@
|
|||||||
# Repo maintenance
|
|
||||||
|
|
||||||
Backups go in; without maintenance, repos grow forever and
|
|
||||||
eventually fall over. restic-manager runs three maintenance
|
|
||||||
operations on a per-host cadence:
|
|
||||||
|
|
||||||
| Command | What it does | Default cadence |
|
|
||||||
|----------|-------------------------------------------------------------|-----------------|
|
|
||||||
| `forget` | Marks snapshots eligible for removal per the retention policy attached to each source group. Cheap; runs append-only. | Daily after the last backup of the day |
|
|
||||||
| `prune` | Reclaims space from the repo. Requires the **admin** credential (write+delete). | Weekly, off-peak |
|
|
||||||
| `check` | Verifies repo integrity. Sub-options surface lock state. | Weekly, with `--read-data-subset N%` to sample pack files |
|
|
||||||
|
|
||||||
A new field on each host row, `host_repo_maintenance`, holds the
|
|
||||||
cron expressions and last-fire anchors. The maintenance ticker on
|
|
||||||
the server runs every 60s, finds hosts whose next-fire is due,
|
|
||||||
and dispatches the right command. The agent's local cron is
|
|
||||||
**only** for backups.
|
|
||||||
|
|
||||||
## Why server-side and not agent-side?
|
|
||||||
|
|
||||||
The agent's cron knows about backups because backups are
|
|
||||||
per-source-group. Maintenance is per-repo, not per-source-group,
|
|
||||||
so doing it server-side keeps the per-host wiring simple:
|
|
||||||
|
|
||||||
- One ticker, not N agent crons to keep in sync.
|
|
||||||
- Cancelling a maintenance dispatch is just "don't dispatch the
|
|
||||||
next one" — no agent-side state to clean up.
|
|
||||||
- Skipping offline hosts is trivial (no queue; only scheduled
|
|
||||||
*backups* queue into `pending_runs`).
|
|
||||||
|
|
||||||
## Forget and the multi-group payload
|
|
||||||
|
|
||||||
A single `forget` job can target several source groups at once.
|
|
||||||
The wire envelope (`ForgetGroups`) carries one entry per group,
|
|
||||||
each with its retention policy. The agent runs N
|
|
||||||
`restic forget --tag <name> --keep-...` invocations in sequence,
|
|
||||||
streams their output, and reports a single terminal status.
|
|
||||||
|
|
||||||
## Prune and the admin credential
|
|
||||||
|
|
||||||
Prune mutates the repo. The everyday append-only credential
|
|
||||||
**cannot** prune — that's the whole point of append-only.
|
|
||||||
restic-manager keeps a second slot per host (`kind = 'admin'`)
|
|
||||||
for the credential that can.
|
|
||||||
|
|
||||||
When a prune is dispatched (cadence-driven or operator-driven):
|
|
||||||
|
|
||||||
1. Server pushes the admin credential to the agent in a fresh
|
|
||||||
`config.update`.
|
|
||||||
2. Agent runs `restic prune` with the merged credential.
|
|
||||||
3. Job finishes; agent discards the admin credential from its
|
|
||||||
in-memory secrets store.
|
|
||||||
|
|
||||||
The server never logs the merged URL (see
|
|
||||||
[Credentials](./credentials.md)).
|
|
||||||
|
|
||||||
## Check and lock state
|
|
||||||
|
|
||||||
`restic check` warns about stale locks when it finds them. The
|
|
||||||
agent ships every check's output back as a `repo.stats` envelope
|
|
||||||
and a stream of log lines; if a stale lock is detected, the
|
|
||||||
**Repo** page surfaces a banner with an **Unlock** button. The
|
|
||||||
operator-only `unlock` command runs `restic unlock` and clears
|
|
||||||
the banner.
|
|
||||||
|
|
||||||
`unlock` has no cadence — it's a manual action, never automatic.
|
|
||||||
Auto-unlocking would mask the cause (probably a previously
|
|
||||||
crashed long-running operation) and risk corrupting an
|
|
||||||
operation the operator has merely lost track of.
|
|
||||||
|
|
||||||
## Repo stats
|
|
||||||
|
|
||||||
After every backup, check, prune, and unlock, the agent runs
|
|
||||||
`restic stats --json --mode raw-data` and ships the result as a
|
|
||||||
`repo.stats` envelope. The server stores this in
|
|
||||||
`host_repo_stats` (latest only) and `host_repo_stats_history`
|
|
||||||
(one row per host per day, last-write-wins per column — a
|
|
||||||
prune-only patch never nulls a backup-time size).
|
|
||||||
|
|
||||||
The host detail page surfaces:
|
|
||||||
|
|
||||||
- Total size + raw size in the vitals strip.
|
|
||||||
- Last-check timestamp + colour-coded status.
|
|
||||||
- Last-prune timestamp.
|
|
||||||
- 30/90-day repo size trend chart.
|
|
||||||
@@ -1,105 +0,0 @@
|
|||||||
# Schedules and source groups
|
|
||||||
|
|
||||||
Two related but separable ideas:
|
|
||||||
|
|
||||||
- A **source group** is a named bundle of "what to back up":
|
|
||||||
include paths, exclude patterns, retention policy, retry
|
|
||||||
configuration, optional pre/post hooks. The group's name is
|
|
||||||
used as the restic snapshot tag, so retention can target it
|
|
||||||
with `restic forget --tag <name>`.
|
|
||||||
- A **schedule** is a cron expression that, when it fires,
|
|
||||||
triggers a backup of one or more source groups on a host.
|
|
||||||
|
|
||||||
Decoupling them means you can have one schedule covering several
|
|
||||||
groups (e.g. `0 1 * * *` running both `system` and `data`), and
|
|
||||||
each group has its own retention without duplicating policy
|
|
||||||
across schedules.
|
|
||||||
|
|
||||||
## Source group anatomy
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
name: data
|
|
||||||
includes:
|
|
||||||
- /var/lib/postgresql
|
|
||||||
- /home
|
|
||||||
excludes:
|
|
||||||
- /home/*/.cache
|
|
||||||
- /home/*/Downloads
|
|
||||||
retention:
|
|
||||||
keep_last: 7
|
|
||||||
keep_daily: 14
|
|
||||||
keep_weekly: 4
|
|
||||||
keep_monthly: 6
|
|
||||||
retry_max: 3
|
|
||||||
retry_backoff_seconds: 600
|
|
||||||
pre_hook: |
|
|
||||||
pg_dump -U postgres -F c -f /var/lib/postgresql/dumps/all.dump
|
|
||||||
post_hook: |
|
|
||||||
rm -f /var/lib/postgresql/dumps/all.dump
|
|
||||||
```
|
|
||||||
|
|
||||||
### Conflict detection
|
|
||||||
|
|
||||||
If your retention policy says `keep_hourly: 24` but no schedule
|
|
||||||
points at this group sub-daily, the UI surfaces a
|
|
||||||
**conflict-dimension banner** ("`hourly` won't be honoured —
|
|
||||||
no schedule fires more often than once a day"). The flag is
|
|
||||||
stored on the source group (`conflict_dimension`) and refreshed
|
|
||||||
whenever a schedule or group changes.
|
|
||||||
|
|
||||||
### Hooks
|
|
||||||
|
|
||||||
`pre_hook` and `post_hook` run on the agent host inside
|
|
||||||
`/bin/sh -c` (`cmd.exe /C` on Windows). Output is streamed back
|
|
||||||
to the live job log as `hook(<phase>): …` lines.
|
|
||||||
|
|
||||||
- A non-zero `pre_hook` exit aborts the backup.
|
|
||||||
- `post_hook` always runs, with `RM_JOB_STATUS=succeeded|failed`
|
|
||||||
in the environment. Use this for cleanup that must happen
|
|
||||||
whether the backup worked or not.
|
|
||||||
- Hooks only run for `kind=backup` jobs. They do not run for
|
|
||||||
`forget`, `prune`, `check`, etc.
|
|
||||||
- Hook bodies are AEAD-encrypted at rest at the HTTP layer; the agent receives
|
|
||||||
plaintext over the WS channel.
|
|
||||||
|
|
||||||
A "host default" pair of hooks lives on the host itself; a
|
|
||||||
source group's own hooks override them when set.
|
|
||||||
|
|
||||||
## Schedule anatomy
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
cron: "0 2 * * *"
|
|
||||||
enabled: true
|
|
||||||
source_group_ids:
|
|
||||||
- <gid for "data">
|
|
||||||
- <gid for "system">
|
|
||||||
```
|
|
||||||
|
|
||||||
Slim by design: a schedule says **when** and **which groups**.
|
|
||||||
Everything else (paths, retention, hooks) lives on the groups.
|
|
||||||
|
|
||||||
The agent's local cron fires the schedule. If the WebSocket is
|
|
||||||
down at fire time, the server queues the firing into
|
|
||||||
`pending_runs` and drains it on the next agent reconnect — a
|
|
||||||
short network blip won't lose the backup.
|
|
||||||
|
|
||||||
### Last / next run
|
|
||||||
|
|
||||||
The schedules tab shows "next" (computed by parsing the cron
|
|
||||||
expression with `robfig/cron/v3`) and "last" (the latest
|
|
||||||
`actor_kind=schedule` job in the `jobs` table) for every
|
|
||||||
schedule. The dashboard host row also surfaces `next 12h ago/from
|
|
||||||
now` when a single covering schedule is the run-now candidate.
|
|
||||||
|
|
||||||
## Bandwidth limits
|
|
||||||
|
|
||||||
Two places set restic's `--limit-upload` / `--limit-download`:
|
|
||||||
|
|
||||||
1. **Host-wide caps** on the host row (`bandwidth_up_kbps`,
|
|
||||||
`bandwidth_down_kbps`). Pushed to the agent on hello and
|
|
||||||
after `PUT /api/hosts/{id}/bandwidth`. Apply to every restic
|
|
||||||
invocation on the host.
|
|
||||||
2. **Per-job overrides** on the per-source-group Run-now form.
|
|
||||||
Win over host caps for the lifetime of that one job.
|
|
||||||
|
|
||||||
If neither is set, restic runs unthrottled.
|
|
||||||
@@ -1,17 +0,0 @@
|
|||||||
# Contributing
|
|
||||||
|
|
||||||
Full contributor guide:
|
|
||||||
[`CONTRIBUTING.md`](https://gitea.dcglab.co.uk/steve/restic-manager/src/branch/main/CONTRIBUTING.md)
|
|
||||||
in the repository root.
|
|
||||||
|
|
||||||
The short version:
|
|
||||||
|
|
||||||
- Open an issue first for non-trivial changes; the design is
|
|
||||||
still moving and unsolicited large PRs may conflict with
|
|
||||||
in-flight work.
|
|
||||||
- `make lint test` must pass.
|
|
||||||
- One logical change per commit, no `Co-Authored-By` trailers.
|
|
||||||
- UK English in identifiers and comments; comments explain the
|
|
||||||
**why** not the **what**.
|
|
||||||
|
|
||||||
Code of conduct: [`CODE_OF_CONDUCT.md`](https://gitea.dcglab.co.uk/steve/restic-manager/src/branch/main/CODE_OF_CONDUCT.md).
|
|
||||||
@@ -1,113 +0,0 @@
|
|||||||
# Enrolling your first host
|
|
||||||
|
|
||||||
The control plane only knows about hosts you've explicitly
|
|
||||||
enrolled. Two paths exist:
|
|
||||||
|
|
||||||
1. **Token-based enrolment** — admin generates a token, pastes it
|
|
||||||
into an install command on the host. The host appears immediately,
|
|
||||||
already mapped to the desired repo.
|
|
||||||
2. **Announce-and-approve** — the agent runs without a token,
|
|
||||||
"announces" itself to the server, and a human in the UI accepts
|
|
||||||
the announcement.
|
|
||||||
|
|
||||||
Token-based is the default and what most operators want; the
|
|
||||||
announce flow exists for the case where you can't easily paste a
|
|
||||||
secret onto the host (auto-imaged endpoints, scripted bring-ups
|
|
||||||
from a config repo).
|
|
||||||
|
|
||||||
## Token-based enrolment
|
|
||||||
|
|
||||||
### From the UI
|
|
||||||
|
|
||||||
1. Click **+ Add host** on the dashboard.
|
|
||||||
2. Fill in the hostname, the restic repo URL, and the repo
|
|
||||||
credentials. The credentials are AEAD-encrypted at the server
|
|
||||||
immediately; what you paste is what the agent receives.
|
|
||||||
3. Optionally pick the initial source paths — these become the
|
|
||||||
first source group on the host.
|
|
||||||
4. Submit. The server mints a one-time token and shows you a
|
|
||||||
copy-pasteable install snippet.
|
|
||||||
|
|
||||||
### On the host (Linux)
|
|
||||||
|
|
||||||
```sh
|
|
||||||
curl -fsSL https://restic.example.com/install/install.sh | \
|
|
||||||
sudo RM_SERVER=https://restic.example.com \
|
|
||||||
RM_ENROL_TOKEN=<token> \
|
|
||||||
bash
|
|
||||||
```
|
|
||||||
|
|
||||||
The script:
|
|
||||||
|
|
||||||
1. Detects architecture (`amd64` or `arm64`).
|
|
||||||
2. Downloads the agent binary from `/agent/binary?os=…&arch=…`.
|
|
||||||
3. Drops the systemd unit at
|
|
||||||
`/etc/systemd/system/restic-manager-agent.service`.
|
|
||||||
4. Runs the agent in `-enrol` mode, which posts the token and
|
|
||||||
stores the persistent bearer it gets back.
|
|
||||||
5. Enables and starts the unit.
|
|
||||||
|
|
||||||
Within seconds the host should appear on the dashboard as
|
|
||||||
**online**.
|
|
||||||
|
|
||||||
### On the host (Windows)
|
|
||||||
|
|
||||||
```pwsh
|
|
||||||
$env:RM_SERVER = "https://restic.example.com"
|
|
||||||
$env:RM_ENROL_TOKEN = "<token>"
|
|
||||||
iwr -useb $env:RM_SERVER/install/install.ps1 | iex
|
|
||||||
```
|
|
||||||
|
|
||||||
Equivalent shape: registers a Windows service via the SCM
|
|
||||||
(see P2-16 for details), runs `-enrol`, starts the service.
|
|
||||||
|
|
||||||
## Recovering a lost token
|
|
||||||
|
|
||||||
Tokens are single-use and short-lived (1h). If you closed the tab
|
|
||||||
before pasting the install command, head to the **Add host** page —
|
|
||||||
outstanding tokens are listed there with a **Regenerate** button.
|
|
||||||
Regenerating revokes the old token's hash and mints a fresh raw
|
|
||||||
token while preserving the original repo credentials and initial
|
|
||||||
paths. (NS-02 in `tasks.md` if you want the design rationale.)
|
|
||||||
|
|
||||||
## Announce-and-approve
|
|
||||||
|
|
||||||
If the host can reach the server but you don't want to paste a
|
|
||||||
secret on it, run the agent in `-announce` mode:
|
|
||||||
|
|
||||||
```sh
|
|
||||||
restic-manager-agent -announce \
|
|
||||||
-server https://restic.example.com \
|
|
||||||
-hostname myhost
|
|
||||||
```
|
|
||||||
|
|
||||||
The host appears in the **Pending hosts** panel on the dashboard
|
|
||||||
with its hostname, OS, arch, and the source IP that announced it.
|
|
||||||
Click **Accept**, fill in the repo URL + credentials, and the
|
|
||||||
server pushes the bearer over the still-open WebSocket. No
|
|
||||||
back-and-forth round trip.
|
|
||||||
|
|
||||||
If you don't accept within an hour the announcement is swept.
|
|
||||||
|
|
||||||
## What happens on the agent
|
|
||||||
|
|
||||||
After enrolment, the agent:
|
|
||||||
|
|
||||||
1. Connects via WebSocket to `/ws/agent` with its bearer token.
|
|
||||||
2. Sends a `hello` envelope with its OS, arch, agent version,
|
|
||||||
restic version, and protocol version.
|
|
||||||
3. Receives a `config.update` carrying its decrypted repo
|
|
||||||
credentials and any source-group paths.
|
|
||||||
4. Sits idle, sending a heartbeat every 30s. Operator-driven
|
|
||||||
"Run now" actions arrive as `command.run` envelopes; scheduled
|
|
||||||
jobs are driven by the agent's local cron.
|
|
||||||
|
|
||||||
## Auto-init of the repository
|
|
||||||
|
|
||||||
The first time a backup runs, the agent invokes `restic init`
|
|
||||||
against the repo you configured at enrolment. If the repo already
|
|
||||||
exists (`config file already exists`) the agent treats it as a
|
|
||||||
success and proceeds. The host's repo status (`unknown` →
|
|
||||||
`ready` / `init_failed`) is surfaced under the vitals strip on
|
|
||||||
the host detail page; if init fails, save fresh credentials in
|
|
||||||
the **Repo** tab to retry.
|
|
||||||
@@ -1,92 +0,0 @@
|
|||||||
# Installing the server
|
|
||||||
|
|
||||||
The reference deployment is a single Docker container fronted by
|
|
||||||
your existing reverse proxy. The image bundles the server binary,
|
|
||||||
the cross-compiled agent binaries, and the install scripts.
|
|
||||||
|
|
||||||
## Prerequisites
|
|
||||||
|
|
||||||
- A Linux host with Docker and Docker Compose.
|
|
||||||
- A reverse proxy in front (Caddy, nginx, Traefik) terminating
|
|
||||||
TLS on a public hostname. The server itself is HTTP-only by
|
|
||||||
design — see [Reverse proxy](./reverse-proxy.md) for why.
|
|
||||||
- A persistent volume for the server's data directory.
|
|
||||||
|
|
||||||
## Quick start
|
|
||||||
|
|
||||||
The reference compose file lives at
|
|
||||||
[`deploy/docker-compose.yml`](https://gitea.dcglab.co.uk/steve/restic-manager/src/branch/main/deploy/docker-compose.yml):
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
services:
|
|
||||||
restic-manager:
|
|
||||||
image: gitea.dcglab.co.uk/steve/restic-manager:${RM_VERSION:-latest}
|
|
||||||
restart: unless-stopped
|
|
||||||
environment:
|
|
||||||
RM_LISTEN: ":8080"
|
|
||||||
RM_DATA_DIR: "/data"
|
|
||||||
RM_BASE_URL: "https://restic.example.com"
|
|
||||||
# Trust your reverse proxy's CIDR so X-Forwarded-* are honoured.
|
|
||||||
RM_TRUSTED_PROXY: "10.0.0.0/8"
|
|
||||||
volumes:
|
|
||||||
- rm-data:/data
|
|
||||||
ports:
|
|
||||||
# Bind localhost only — your reverse proxy is the public face.
|
|
||||||
- "127.0.0.1:8080:8080"
|
|
||||||
|
|
||||||
volumes:
|
|
||||||
rm-data:
|
|
||||||
```
|
|
||||||
|
|
||||||
Bring it up:
|
|
||||||
|
|
||||||
```sh
|
|
||||||
docker compose up -d
|
|
||||||
docker compose logs -f restic-manager
|
|
||||||
```
|
|
||||||
|
|
||||||
The first run prints a one-time **bootstrap token** to the log. Use
|
|
||||||
it within an hour or it expires; if you miss the window the
|
|
||||||
container prints it again on next start as long as no admin user
|
|
||||||
exists.
|
|
||||||
|
|
||||||
## First-run admin setup
|
|
||||||
|
|
||||||
Open `https://restic.example.com/bootstrap` (or whatever your
|
|
||||||
public URL is). Paste the bootstrap token, pick a username and a
|
|
||||||
password (≥ 12 characters), and submit. You'll land in the
|
|
||||||
dashboard logged in as the new admin.
|
|
||||||
|
|
||||||
If you'd rather curl it, the equivalent is:
|
|
||||||
|
|
||||||
```sh
|
|
||||||
curl -X POST https://restic.example.com/api/bootstrap \
|
|
||||||
-H 'Content-Type: application/json' \
|
|
||||||
-d '{"token":"<token-from-log>","username":"admin","password":"<≥12 chars>"}'
|
|
||||||
```
|
|
||||||
|
|
||||||
## Backing up the secret key
|
|
||||||
|
|
||||||
Inside the data volume, `secret.key` holds the AEAD key used to
|
|
||||||
encrypt every credential at rest. **Back it up separately from
|
|
||||||
the database.** Without it, encrypted credentials in the database
|
|
||||||
are unrecoverable; you'd have to re-enrol every host.
|
|
||||||
|
|
||||||
A simple working approach: copy `secret.key` to your password
|
|
||||||
manager or to a separately-backed-up secrets vault the day you
|
|
||||||
install. It doesn't change.
|
|
||||||
|
|
||||||
## Updating the server
|
|
||||||
|
|
||||||
```sh
|
|
||||||
# Pin a new version in your compose file (.env or docker-compose.yml),
|
|
||||||
# then:
|
|
||||||
docker compose pull
|
|
||||||
docker compose up -d
|
|
||||||
```
|
|
||||||
|
|
||||||
Migrations run automatically on startup; the server will refuse to
|
|
||||||
start if a migration fails (better to bail than to half-migrate).
|
|
||||||
|
|
||||||
For the agent self-update story, see
|
|
||||||
[Updating agents](../operations/updates.md).
|
|
||||||
@@ -1,95 +0,0 @@
|
|||||||
# Running behind a reverse proxy
|
|
||||||
|
|
||||||
The restic-manager server is HTTP-only by design. TLS termination,
|
|
||||||
public hostname, ACME, HSTS, and edge-level rate limiting all
|
|
||||||
belong to a reverse proxy you already operate outside this project.
|
|
||||||
|
|
||||||
## What the proxy must forward
|
|
||||||
|
|
||||||
The server reads four headers when (and only when) the immediate
|
|
||||||
peer matches `RM_TRUSTED_PROXY`:
|
|
||||||
|
|
||||||
| Header | Value | Why |
|
|
||||||
|------------------------|----------------------------------------------------|-----|
|
|
||||||
| `X-Forwarded-For` | The original client IP | Rate-limit keys, audit log entries, OIDC redirect-URI checks. |
|
|
||||||
| `X-Forwarded-Proto` | `https` | Used for absolute URLs (e.g. OIDC redirect URIs). |
|
|
||||||
| `Host` | The public hostname clients use | Cookies are scoped to this; `RM_BASE_URL` must match. |
|
|
||||||
| `Connection` / `Upgrade` | Pass through unchanged | `/ws/agent` and `/api/jobs/{id}/stream` are WebSockets; without `Upgrade: websocket` they fail. |
|
|
||||||
|
|
||||||
Set `RM_TRUSTED_PROXY` to the CIDR (or comma-separated list of
|
|
||||||
CIDRs) the proxy connects from. Anything outside that range has
|
|
||||||
its `X-Forwarded-*` headers ignored, so a stray request that
|
|
||||||
bypasses the proxy can't spoof the client IP.
|
|
||||||
|
|
||||||
## Caddy
|
|
||||||
|
|
||||||
```caddyfile
|
|
||||||
restic.example.com {
|
|
||||||
encode zstd gzip
|
|
||||||
reverse_proxy 127.0.0.1:8080 {
|
|
||||||
header_up X-Real-IP {remote_host}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
Caddy adds `X-Forwarded-For` / `X-Forwarded-Proto` automatically
|
|
||||||
and passes WebSocket headers through by default, so this is the
|
|
||||||
whole config.
|
|
||||||
|
|
||||||
## nginx
|
|
||||||
|
|
||||||
```nginx
|
|
||||||
server {
|
|
||||||
listen 443 ssl http2;
|
|
||||||
server_name restic.example.com;
|
|
||||||
|
|
||||||
ssl_certificate /etc/letsencrypt/live/restic.example.com/fullchain.pem;
|
|
||||||
ssl_certificate_key /etc/letsencrypt/live/restic.example.com/privkey.pem;
|
|
||||||
|
|
||||||
location / {
|
|
||||||
proxy_pass http://127.0.0.1:8080;
|
|
||||||
proxy_http_version 1.1;
|
|
||||||
proxy_set_header Host $host;
|
|
||||||
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
|
|
||||||
proxy_set_header X-Forwarded-Proto https;
|
|
||||||
|
|
||||||
# WebSocket upgrade
|
|
||||||
proxy_set_header Upgrade $http_upgrade;
|
|
||||||
proxy_set_header Connection "upgrade";
|
|
||||||
|
|
||||||
# Long-lived agent WS — raise the read timeout to 24h for this surface.
|
|
||||||
proxy_read_timeout 86400s;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
## Traefik
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
http:
|
|
||||||
routers:
|
|
||||||
restic-manager:
|
|
||||||
rule: "Host(`restic.example.com`)"
|
|
||||||
entryPoints: [websecure]
|
|
||||||
tls:
|
|
||||||
certResolver: letsencrypt
|
|
||||||
service: restic-manager
|
|
||||||
|
|
||||||
services:
|
|
||||||
restic-manager:
|
|
||||||
loadBalancer:
|
|
||||||
servers:
|
|
||||||
- url: "http://restic-manager:8080"
|
|
||||||
passHostHeader: true
|
|
||||||
```
|
|
||||||
|
|
||||||
Traefik forwards WebSocket upgrades and the standard
|
|
||||||
`X-Forwarded-*` set out of the box.
|
|
||||||
|
|
||||||
## Verification
|
|
||||||
|
|
||||||
After bringing the proxy up, the audit log should show your real
|
|
||||||
client IP for an interactive login (not the proxy's local
|
|
||||||
address). If you see `127.0.0.1` or the proxy's container IP, your
|
|
||||||
`RM_TRUSTED_PROXY` is wrong or `X-Forwarded-For` isn't being
|
|
||||||
forwarded.
|
|
||||||
@@ -1,86 +0,0 @@
|
|||||||
# restic-manager
|
|
||||||
|
|
||||||
restic-manager is a self-hosted, browser-based, single-pane-of-glass
|
|
||||||
for managing [restic](https://restic.net) backups across a fleet of
|
|
||||||
Linux and Windows endpoints. It's designed for **small fleets** —
|
|
||||||
the original target was twelve endpoints — and **one operator**.
|
|
||||||
|
|
||||||
## What it does
|
|
||||||
|
|
||||||
- Centralised view of every endpoint's last backup, repo size,
|
|
||||||
snapshot count, and recent jobs.
|
|
||||||
- Trigger any restic operation remotely (`backup`, `forget`, `prune`,
|
|
||||||
`check`, `unlock`, `snapshots`, `stats`, `diff`, `restore`).
|
|
||||||
- Per-host backup schedules with source groups (named bundles of
|
|
||||||
paths + retention policy).
|
|
||||||
- Live job log streamed to the browser; downloadable as text or NDJSON.
|
|
||||||
- Restore wizard with snapshot tree browse + path selection.
|
|
||||||
- Repo-level health surfacing (size, raw size, last-check, lock
|
|
||||||
state) plus a 30/90-day size trend.
|
|
||||||
- Alerting over webhook, ntfy, or SMTP.
|
|
||||||
- Cross-platform agent (Linux + Windows).
|
|
||||||
- Append-only-credential-friendly with a separate admin credential
|
|
||||||
for forget/prune.
|
|
||||||
|
|
||||||
## What it isn't
|
|
||||||
|
|
||||||
- **Not a SaaS.** Single-instance, single-tenant, by design.
|
|
||||||
- **Not a replacement for restic** — it's a control plane. The agent
|
|
||||||
shells out to a real `restic` binary.
|
|
||||||
- **Not highly available.** SQLite, single process; if you need
|
|
||||||
HA backups, you're shopping in the wrong aisle.
|
|
||||||
- **Not a multi-protocol backup tool.** restic only.
|
|
||||||
|
|
||||||
## How it fits together
|
|
||||||
|
|
||||||
```
|
|
||||||
┌──────────────────────────────────────────────┐
|
|
||||||
│ Server (control plane, Docker) │
|
|
||||||
│ - REST + WebSocket API │
|
|
||||||
│ - SQLite store │
|
|
||||||
│ - Embedded HTMX UI │
|
|
||||||
└──────────┬─────────────────────────┬─────────┘
|
|
||||||
│ outbound WS │ HTTP(S)
|
|
||||||
│ │
|
|
||||||
┌──────────▼──────────┐ ┌──────────▼─────────┐
|
|
||||||
│ Agent (per host) │ │ Browser (operator) │
|
|
||||||
│ - restic wrapper │ └─────────────────────┘
|
|
||||||
│ - cron for sched. │
|
|
||||||
└──────────┬──────────┘
|
|
||||||
│ restic
|
|
||||||
┌──────────▼──────────────────────────────────┐
|
|
||||||
│ rest-server / S3 / SFTP / local repo │
|
|
||||||
│ (the actual backup data — server never │
|
|
||||||
│ touches it) │
|
|
||||||
└─────────────────────────────────────────────┘
|
|
||||||
```
|
|
||||||
|
|
||||||
The control plane is a Go binary that runs in Docker. Each endpoint
|
|
||||||
runs a small Go agent that holds an outbound WebSocket to the
|
|
||||||
control plane. Backup data flows directly between the agent and the
|
|
||||||
restic repository — the control plane never sees a snapshot byte.
|
|
||||||
|
|
||||||
## Where to start
|
|
||||||
|
|
||||||
- [Installing the server](./getting-started/install.md) walks
|
|
||||||
through the Docker-based reference deployment.
|
|
||||||
- [Enrolling your first host](./getting-started/enrolling-hosts.md)
|
|
||||||
covers the install scripts and the announce-and-approve flow.
|
|
||||||
- [Architecture](./concepts/architecture.md) is the right read if
|
|
||||||
you want to know why something is the way it is before running
|
|
||||||
the install.
|
|
||||||
|
|
||||||
## Project status
|
|
||||||
|
|
||||||
Pre-1.0 but feature-complete for the original use case. Phases
|
|
||||||
0–4 are landed (MVP, scheduling, restore, RBAC + OIDC); Phase 5
|
|
||||||
(this docs site, contributor onboarding, end-to-end CI) is in
|
|
||||||
flight. See [`tasks.md`](https://gitea.dcglab.co.uk/steve/restic-manager/src/branch/main/tasks.md)
|
|
||||||
for the live roadmap and [`spec.md`](https://gitea.dcglab.co.uk/steve/restic-manager/src/branch/main/spec.md)
|
|
||||||
for the canonical design doc.
|
|
||||||
|
|
||||||
## License
|
|
||||||
|
|
||||||
[PolyForm Noncommercial 1.0.0](https://polyformproject.org/licenses/noncommercial/1.0.0/).
|
|
||||||
Personal and community deployments welcome; commercial use
|
|
||||||
requires a separate license.
|
|
||||||
@@ -1,39 +0,0 @@
|
|||||||
# License
|
|
||||||
|
|
||||||
restic-manager is licensed under
|
|
||||||
[**PolyForm Noncommercial 1.0.0**](https://polyformproject.org/licenses/noncommercial/1.0.0/).
|
|
||||||
The full text lives at
|
|
||||||
[`LICENSE`](https://gitea.dcglab.co.uk/steve/restic-manager/src/branch/main/LICENSE)
|
|
||||||
in the repository root.
|
|
||||||
|
|
||||||
## What this means
|
|
||||||
|
|
||||||
- **Personal, hobbyist, educational, charitable, and similar
|
|
||||||
noncommercial use** is fully permitted, including modification
|
|
||||||
and redistribution.
|
|
||||||
- **Commercial use is not permitted** without a separate
|
|
||||||
license. The maintainer is not currently offering one — if
|
|
||||||
you need commercial rights, open an issue to start the
|
|
||||||
conversation.
|
|
||||||
- The license is permissive about everything except commercial
|
|
||||||
use: you can fork, modify, deploy in your home/lab, and
|
|
||||||
contribute back.
|
|
||||||
|
|
||||||
## Why this license
|
|
||||||
|
|
||||||
The PolyForm Noncommercial license was chosen because:
|
|
||||||
|
|
||||||
- It's a real, legal, plainly-worded license (not a custom
|
|
||||||
half-written variant).
|
|
||||||
- It permits the realistic uses for a hobby project (the
|
|
||||||
maintainer's homelab, a friend's fleet, a charity's IT
|
|
||||||
closet) without inviting commercial vendors to repackage
|
|
||||||
the work.
|
|
||||||
- It's compatible with the project staying small and
|
|
||||||
maintainable — the maintainer doesn't want to be on the hook
|
|
||||||
for SLA-grade commercial support.
|
|
||||||
|
|
||||||
## Contributions
|
|
||||||
|
|
||||||
By contributing, you agree your contributions are licensed
|
|
||||||
under the same PolyForm Noncommercial 1.0.0 license.
|
|
||||||
@@ -1,73 +0,0 @@
|
|||||||
# Alerts and notifications
|
|
||||||
|
|
||||||
restic-manager raises alerts on conditions that need human
|
|
||||||
attention. The alert engine evaluates rules on a 60s tick and
|
|
||||||
on every job-finished / host-online event.
|
|
||||||
|
|
||||||
## Built-in alert kinds
|
|
||||||
|
|
||||||
| Kind | Trigger | Severity |
|
|
||||||
|---------------------|---------|----------|
|
|
||||||
| `backup_failed` | A backup job ends in `failed` or `cancelled` | warning |
|
|
||||||
| `forget_failed` | A forget job ends in `failed` | warning |
|
|
||||||
| `prune_failed` | A prune job ends in `failed` | critical |
|
|
||||||
| `check_failed` | A check job ends in `failed` | critical |
|
|
||||||
| `agent_offline` | A host has been offline more than 90s past its heartbeat cadence | warning |
|
|
||||||
| `stale_schedule` | A schedule's "last run" is more than 1.5 × its interval ago | warning |
|
|
||||||
| `update_failed` | An agent self-update reported a failure, or the agent didn't reconnect within 90s | warning |
|
|
||||||
| `fleet_update_halted`| The rolling fleet-update worker stopped on a failure | critical |
|
|
||||||
|
|
||||||
Each alert has a `dedup_key` so re-firing the same condition
|
|
||||||
just bumps `last_seen_at` — the operator gets one row per
|
|
||||||
condition, not a thousand.
|
|
||||||
|
|
||||||
## Lifecycle
|
|
||||||
|
|
||||||
```
|
|
||||||
raised ──acknowledge──▶ acknowledged ──resolve──▶ resolved
|
|
||||||
│ │
|
|
||||||
└────────auto-resolve──────┘
|
|
||||||
(e.g. agent_offline auto-resolves on agent_online)
|
|
||||||
```
|
|
||||||
|
|
||||||
- **Acknowledge** says "I've seen this, stop notifying about it".
|
|
||||||
- **Resolve** says "the underlying condition is gone".
|
|
||||||
- Some alerts auto-resolve when the condition clears
|
|
||||||
(`agent_offline` is the canonical example).
|
|
||||||
|
|
||||||
## Notification channels
|
|
||||||
|
|
||||||
Configure under **Settings → Notifications**. Each channel can
|
|
||||||
subscribe to all alerts or filter by severity.
|
|
||||||
|
|
||||||
### Webhook
|
|
||||||
|
|
||||||
Posts a JSON envelope to a URL of your choice. Useful for
|
|
||||||
piping into Slack via an Incoming Webhook URL or into your own
|
|
||||||
alerting tooling.
|
|
||||||
|
|
||||||
### ntfy
|
|
||||||
|
|
||||||
Pushes a plain-text alert to an [ntfy.sh](https://ntfy.sh/)
|
|
||||||
topic. Configure the topic URL; optional bearer token if you
|
|
||||||
self-host with auth.
|
|
||||||
|
|
||||||
### SMTP
|
|
||||||
|
|
||||||
Plain SMTP (with optional TLS). Configure host, port,
|
|
||||||
username, password, and the recipient list.
|
|
||||||
|
|
||||||
## Test fire
|
|
||||||
|
|
||||||
Each channel exposes a **Test fire** button that dispatches a
|
|
||||||
single synthetic alert through the channel without touching the
|
|
||||||
alert engine. Use this when you've added a channel and want to
|
|
||||||
verify connectivity before the next real failure happens.
|
|
||||||
|
|
||||||
## What gets logged
|
|
||||||
|
|
||||||
Every alert raise / acknowledge / resolve writes an audit log
|
|
||||||
entry. The audit log UI at **Settings → Audit log** filters by
|
|
||||||
user, action, target, and time range — useful for the
|
|
||||||
post-incident "who clicked acknowledge on the prune-failure
|
|
||||||
alert" question.
|
|
||||||
@@ -1,73 +0,0 @@
|
|||||||
# Backups and restores
|
|
||||||
|
|
||||||
## Running a backup
|
|
||||||
|
|
||||||
Three ways to trigger one:
|
|
||||||
|
|
||||||
1. **Scheduled** — the agent's local cron fires at the time set
|
|
||||||
on the schedule.
|
|
||||||
2. **Run-now** — operator clicks **Run now** on the host detail
|
|
||||||
right rail. Posts to `/hosts/{id}/run-backup` (defaults to all
|
|
||||||
source groups) or to a per-group form for finer control.
|
|
||||||
3. **API** — `POST /api/hosts/{id}/jobs` with the appropriate
|
|
||||||
payload. Same audit + dispatch path.
|
|
||||||
|
|
||||||
In every case the server creates a `jobs` row, broadcasts a
|
|
||||||
`command.run` to the host, and lands the operator on the live
|
|
||||||
job log page (HTMX `HX-Redirect`).
|
|
||||||
|
|
||||||
## Cancelling a job
|
|
||||||
|
|
||||||
Any running job — backup, forget, prune, restore, anything —
|
|
||||||
exposes a **Cancel** button on its detail page. The server
|
|
||||||
broadcasts `command.cancel`, and the agent kills the running
|
|
||||||
restic subprocess via context cancel: SIGTERM first, SIGKILL
|
|
||||||
after a 5s grace (`cmd.Cancel` + `cmd.WaitDelay`). On Windows the
|
|
||||||
SIGTERM step is replaced with `os.Kill` because Windows can't
|
|
||||||
deliver SIGTERM. Result: a cancelled job lands as `cancelled`
|
|
||||||
within a couple of hundred milliseconds.
|
|
||||||
|
|
||||||
## Restore wizard
|
|
||||||
|
|
||||||
Restoring a file or path goes through a four-step wizard at
|
|
||||||
`/hosts/{id}/restore`:
|
|
||||||
|
|
||||||
1. **Pick a snapshot.** Search by id or by date; the page is
|
|
||||||
pre-populated when you launched the wizard from a snapshot row.
|
|
||||||
2. **Browse the snapshot tree.** Lazy-loaded children via the
|
|
||||||
`MsgTreeList` synchronous WS RPC; results are cached
|
|
||||||
per-wizard-session for 30 minutes. Pick the absolute paths
|
|
||||||
you want.
|
|
||||||
3. **Choose a target.** Either **In place** (overwrites the
|
|
||||||
live filesystem; requires you to type the hostname to
|
|
||||||
confirm) or **New directory** (default
|
|
||||||
`$HOME/rm-restore/<job-id>/`; agent expands `$HOME` /
|
|
||||||
`${HOME}` / `~/` and creates the directory chain).
|
|
||||||
4. **Review and submit.** Server mints a job, dispatches
|
|
||||||
`command.run` with a `RestorePayload`, and `HX-Redirect`s to
|
|
||||||
the live job log.
|
|
||||||
|
|
||||||
`--no-ownership` is gated on restic ≥ 0.17 (the flag was added
|
|
||||||
in that release). Hosts running 0.16 don't get the flag and
|
|
||||||
restore as the running user instead.
|
|
||||||
|
|
||||||
## Snapshot diff
|
|
||||||
|
|
||||||
Two snapshot ids in the **Diff** form on the host detail page →
|
|
||||||
a `JobDiff` job that runs `restic diff <a> <b>`. Output streams
|
|
||||||
to the standard live job log. Useful when investigating a
|
|
||||||
suspiciously-sized backup.
|
|
||||||
|
|
||||||
## Job log artefacts
|
|
||||||
|
|
||||||
Every job's log is persisted in `job_logs` (one row per line),
|
|
||||||
not just streamed in-memory. That gives you:
|
|
||||||
|
|
||||||
- A live view at `/jobs/{id}` while the job runs.
|
|
||||||
- Two download formats from the same page header dropdown:
|
|
||||||
- **txt** — one line per row, `HH:MM:SS.mmm TAG payload`.
|
|
||||||
- **ndjson** — one self-contained JSON object per line
|
|
||||||
(`{seq, ts, stream, payload}`), perfect for `jq`.
|
|
||||||
|
|
||||||
Downloads work whether the job is running or finished —
|
|
||||||
the source is the DB, not the live socket.
|
|
||||||
@@ -1,61 +0,0 @@
|
|||||||
# Observability with Prometheus
|
|
||||||
|
|
||||||
restic-manager can expose a Prometheus scrape endpoint at
|
|
||||||
`GET /metrics`. The endpoint is **opt-in** — without an explicit
|
|
||||||
auth gate it isn't even mounted, so a forgotten config can't
|
|
||||||
accidentally publish fleet state.
|
|
||||||
|
|
||||||
The full reference lives at
|
|
||||||
[`docs/prometheus.md`](https://gitea.dcglab.co.uk/steve/restic-manager/src/branch/main/docs/prometheus.md);
|
|
||||||
the short version follows.
|
|
||||||
|
|
||||||
## Enable the endpoint
|
|
||||||
|
|
||||||
Set at least one of:
|
|
||||||
|
|
||||||
- `RM_METRICS_TOKEN` — `Authorization: Bearer <token>` required.
|
|
||||||
- `RM_METRICS_TRUSTED_CIDR` — restricts source IPs (comma-separated CIDRs).
|
|
||||||
|
|
||||||
When both are set, a request must pass both. The token compare is constant-time; the CIDR check
|
|
||||||
honours `X-Forwarded-For` only when the immediate hop matches
|
|
||||||
`RM_TRUSTED_PROXY`.
|
|
||||||
|
|
||||||
## Metrics emitted
|
|
||||||
|
|
||||||
- **Server gauges**: `rm_hosts_total`, `rm_hosts_online`,
|
|
||||||
`rm_active_alerts{severity}`, `rm_build_info{...}`.
|
|
||||||
- **Per-host gauges**: `rm_host_agent_online`,
|
|
||||||
`rm_host_last_backup_timestamp_seconds`,
|
|
||||||
`rm_host_last_backup_success`, `rm_host_repo_size_bytes`,
|
|
||||||
`rm_host_snapshot_count`, `rm_host_open_alerts`,
|
|
||||||
`rm_host_repo_status`.
|
|
||||||
- **Histogram**:
|
|
||||||
`rm_job_duration_seconds{kind,status,le=…}` (buckets
|
|
||||||
`1, 5, 30, 60, 300, 1800, 3600, 21600, 86400, +Inf`).
|
|
||||||
|
|
||||||
In-memory histogram only. Prometheus persists the scrapes; if
|
|
||||||
you need durable history at hourly resolution that's
|
|
||||||
Prometheus's job.
|
|
||||||
|
|
||||||
## Sample Grafana dashboard
|
|
||||||
|
|
||||||
[`deploy/grafana/restic-manager-dashboard.json`](https://gitea.dcglab.co.uk/steve/restic-manager/src/branch/main/deploy/grafana/restic-manager-dashboard.json)
|
|
||||||
imports through Grafana's **+ → Import → Upload JSON file**.
|
|
||||||
Six panels:
|
|
||||||
|
|
||||||
1. Fleet status (online / total).
|
|
||||||
2. Open alerts by severity.
|
|
||||||
3. Backups failing on most-recent run.
|
|
||||||
4. Hosts table — last backup, repo size, snapshots, open alerts.
|
|
||||||
5. Repo size over time, one line per host.
|
|
||||||
6. Job-duration p95 over a 1h window per kind.
|
|
||||||
|
|
||||||
## Alerting
|
|
||||||
|
|
||||||
restic-manager already has a built-in alert engine
|
|
||||||
([Alerts](./alerts.md)). The dashboard intentionally doesn't
|
|
||||||
duplicate it as Prometheus alert rules. If you want
|
|
||||||
Prometheus-side alerts on top, write your own based on the
|
|
||||||
metrics above — `rm_host_last_backup_success == 0`,
|
|
||||||
`time() - rm_host_last_backup_timestamp_seconds > <max age>`,
|
|
||||||
or whatever suits your environment.
|
|
||||||
@@ -1,50 +0,0 @@
|
|||||||
# Updating agents
|
|
||||||
|
|
||||||
Server updates are a `docker compose pull && docker compose up -d` away.
|
|
||||||
Agents update via the control plane.
|
|
||||||
|
|
||||||
## Single-host update
|
|
||||||
|
|
||||||
Each host's detail page shows an **Update agent** button when
|
|
||||||
the agent's reported version is older than the server's. The
|
|
||||||
button:
|
|
||||||
|
|
||||||
1. Dispatches a `command.update` to that host.
|
|
||||||
2. The agent fetches the appropriate binary from
|
|
||||||
`$RM_SERVER/agent/binary?os=…&arch=…` to
|
|
||||||
`<binary-path>.new`.
|
|
||||||
3. Copies the running binary to `<binary-path>.old` (one
|
|
||||||
revision back, in case rollback is needed).
|
|
||||||
4. Atomic-renames `.new` over the running binary.
|
|
||||||
5. Exits cleanly. systemd's `Restart=always` (or Windows SCM)
|
|
||||||
brings the process back on the new binary.
|
|
||||||
|
|
||||||
A 90-second timer on the server side waits for a hello at the
|
|
||||||
target version and marks the update succeeded — or, if the
|
|
||||||
agent doesn't reconnect at the expected version in time, marks
|
|
||||||
the update **failed** and raises an `update_failed` alert.
|
|
||||||
|
|
||||||
## Fleet update
|
|
||||||
|
|
||||||
The admin-only **Settings → Fleet update** page drives a rolling
|
|
||||||
update across every host in the fleet:
|
|
||||||
|
|
||||||
- One host at a time.
|
|
||||||
- Wait for hello-with-target-version (max 95s).
|
|
||||||
- On any host failing, **halt** the rollout, raise a
|
|
||||||
`fleet_update_halted` alert, leave the rest of the fleet on
|
|
||||||
the old version. No surprise mass-failures.
|
|
||||||
|
|
||||||
You can cancel an in-progress fleet update; the worker stops
|
|
||||||
after the current host finishes.
|
|
||||||
|
|
||||||
## TLS and corruption
|
|
||||||
|
|
||||||
Updates rely on the reverse proxy's TLS to detect corruption in
|
|
||||||
transit. There's no separate sha256 verification step — we
|
|
||||||
chose the simpler model on the basis that the same TLS already
|
|
||||||
gates every other byte the server hands to the agent.
|
|
||||||
|
|
||||||
If you'd like a separate signature step before applying updates,
|
|
||||||
that's a future-phase enhancement (see `tasks.md` Phase 6
|
|
||||||
candidates).
|
|
||||||
@@ -1,58 +0,0 @@
|
|||||||
# Environment variables
|
|
||||||
|
|
||||||
The server reads its configuration from environment variables
|
|
||||||
(canonical) with an optional YAML overlay. Env wins over YAML so
|
|
||||||
operators can tweak a single setting without rewriting the file.
|
|
||||||
|
|
||||||
## Server
|
|
||||||
|
|
||||||
| Variable | Default | Meaning |
|
|
||||||
|---------------------------|----------------------------------|---------|
|
|
||||||
| `RM_LISTEN` | `:8080` | TCP listener for the HTTP server. |
|
|
||||||
| `RM_DATA_DIR` | `/data` | Persistent state directory (SQLite, secret key, agent assets). |
|
|
||||||
| `RM_BASE_URL` | (none) | Public URL clients use; required for OIDC redirects + cookie scope. |
|
|
||||||
| `RM_SECRET_KEY_FILE` | `${RM_DATA_DIR}/secret.key` | Path to the AEAD key file. Auto-generated on first run. |
|
|
||||||
| `RM_COOKIE_SECURE` | `true` | Set `false` only for local HTTP testing. Controls `Secure` on session cookies. |
|
|
||||||
| `RM_TRUSTED_PROXY` | (none) | Comma-separated CIDRs trusted for `X-Forwarded-*`. |
|
|
||||||
| `RM_BUNDLED_ASSETS_DIR` | `/opt/restic-manager/dist` | Read-only path with bundled agent binaries + install scripts (the docker image bakes them here). |
|
|
||||||
| `RM_METRICS_TOKEN` | (off) | When set, `GET /metrics` requires `Authorization: Bearer <token>`. |
|
|
||||||
| `RM_METRICS_TRUSTED_CIDR` | (off) | When set, `GET /metrics` restricts source IPs (comma-separated CIDRs). |
|
|
||||||
|
|
||||||
OIDC variables (all optional; empty issuer disables OIDC):
|
|
||||||
|
|
||||||
| Variable | Meaning |
|
|
||||||
|--------------------------------|---------|
|
|
||||||
| `RM_OIDC_ISSUER` | OIDC discovery URL (e.g. `https://auth.example.com`). |
|
|
||||||
| `RM_OIDC_CLIENT_ID` | Client ID registered with the IdP. |
|
|
||||||
| `RM_OIDC_CLIENT_SECRET` | Client secret (or use `RM_OIDC_CLIENT_SECRET_FILE`). |
|
|
||||||
| `RM_OIDC_CLIENT_SECRET_FILE` | Path to a file holding the client secret. |
|
|
||||||
| `RM_OIDC_DISPLAY_NAME` | Button label on the login page (e.g. "Authelia"). |
|
|
||||||
| `RM_OIDC_ROLE_CLAIM` | Token claim that carries roles (default `groups`). |
|
|
||||||
| `RM_OIDC_ROLE_MAPPING` | `idp-group=role` entries, comma-separated (e.g. `rm-admin=admin,rm-ops=operator`). |
|
|
||||||
| `RM_OIDC_REDIRECT_URL` | Override for the redirect URL; defaults to `${RM_BASE_URL}/auth/oidc/callback`. |
|
|
||||||
|
|
||||||
## Agent
|
|
||||||
|
|
||||||
| Variable | Default | Meaning |
|
|
||||||
|----------------------|---------|---------|
|
|
||||||
| `RM_AGENT_CONFIG` | `/etc/restic-manager/agent.yaml` (Linux) | Config file path. |
|
|
||||||
|
|
||||||
The agent's other settings live in the YAML file (server URL,
|
|
||||||
bearer token, optional cert pin). The install script writes that
|
|
||||||
file for you at enrolment.
|
|
||||||
|
|
||||||
## Build-time
|
|
||||||
|
|
||||||
The Makefile threads `-ldflags` from `git describe` into the
|
|
||||||
`internal/version` package so `--version` and the dashboard
|
|
||||||
footer show the right values:
|
|
||||||
|
|
||||||
```
|
|
||||||
-X gitea.dcglab.co.uk/steve/restic-manager/internal/version.Version=$(VERSION)
|
|
||||||
-X gitea.dcglab.co.uk/steve/restic-manager/internal/version.Commit=$(COMMIT)
|
|
||||||
```
|
|
||||||
|
|
||||||
If you build with `go build` directly (no Makefile), `Version`
|
|
||||||
falls back to `dev` and the agent-update comparison falls back
|
|
||||||
to "always equal". Source-build deployments can still run; they
|
|
||||||
just don't participate in the self-update flow.
|
|
||||||
@@ -1,82 +0,0 @@
|
|||||||
# HTTP endpoints
|
|
||||||
|
|
||||||
A non-exhaustive map of the surfaces the control plane exposes.
|
|
||||||
All `/api/*` routes return JSON; all other paths render HTML
|
|
||||||
(server-rendered with HTMX in the loop).
|
|
||||||
|
|
||||||
The canonical wiring lives at
|
|
||||||
[`internal/server/http/server.go`](https://gitea.dcglab.co.uk/steve/restic-manager/src/branch/main/internal/server/http/server.go);
|
|
||||||
when in doubt, read the routes block there.
|
|
||||||
|
|
||||||
## Public (no auth)
|
|
||||||
|
|
||||||
| Method | Path | Purpose |
|
|
||||||
|--------|----------------------------|---------|
|
|
||||||
| GET | `/healthz` | Liveness probe. Returns 204. |
|
|
||||||
| POST | `/api/auth/login` | Local-user login. JSON body: `{username, password}`. |
|
|
||||||
| POST | `/api/auth/logout` | Invalidate the session cookie. |
|
|
||||||
| POST | `/api/bootstrap` | First-run admin creation. Accepts the token printed at first start. |
|
|
||||||
| POST | `/api/agents/enroll` | Token-based agent enrolment. |
|
|
||||||
| POST | `/api/agents/announce` | Announce-and-approve agent enrolment. |
|
|
||||||
| GET | `/agent/binary?os=&arch=` | Serves the agent binary for the install scripts. |
|
|
||||||
| GET | `/install/*` | Serves the Linux + Windows install scripts and the systemd unit. |
|
|
||||||
| GET | `/api/version` | Build version + commit JSON. |
|
|
||||||
| GET | `/metrics` | Prometheus exposition (only when opted-in via `RM_METRICS_TOKEN` / `RM_METRICS_TRUSTED_CIDR`). |
|
|
||||||
| GET | `/login`, `/setup`, `/bootstrap` | UI pages. |
|
|
||||||
|
|
||||||
## Authenticated (any role)
|
|
||||||
|
|
||||||
| Method | Path | Purpose |
|
|
||||||
|--------|------------------------------------------|---------|
|
|
||||||
| GET | `/` | Dashboard. |
|
|
||||||
| GET | `/hosts/{id}` | Host detail. |
|
|
||||||
| GET | `/hosts/{id}/repo` | Repo tab. |
|
|
||||||
| GET | `/hosts/{id}/jobs` | Jobs tab. |
|
|
||||||
| GET | `/hosts/{id}/sources` | Source groups list. |
|
|
||||||
| GET | `/hosts/{id}/schedules` | Schedules list. |
|
|
||||||
| GET | `/jobs/{id}` | Live job log. |
|
|
||||||
| GET | `/api/hosts`, `/api/fleet/summary` | JSON list + summary. |
|
|
||||||
| GET | `/api/jobs/{id}/stream` | WebSocket subscription to a job's live log. |
|
|
||||||
| GET | `/api/jobs/{id}/log.{txt,ndjson}` | Persisted log download. |
|
|
||||||
|
|
||||||
## Operator role and above
|
|
||||||
|
|
||||||
| Method | Path | Purpose |
|
|
||||||
|--------|---------------------------------------|---------|
|
|
||||||
| POST | `/hosts/{id}/run-backup` | Run-now (HTMX form-post). |
|
|
||||||
| POST | `/hosts/{id}/sources/{gid}/run-now` | Per-source-group run-now. |
|
|
||||||
| POST | `/hosts/{id}/repo/{prune,check,unlock,reinit,probe}` | Maintenance actions. |
|
|
||||||
| POST | `/api/hosts/{id}/snapshots/diff` | Snapshot-diff job. |
|
|
||||||
| POST | `/hosts/{id}/restore` | Restore wizard submit. |
|
|
||||||
| POST | `/api/jobs/{id}/cancel` | Cancel a running job. |
|
|
||||||
| POST | `/hosts/{id}/tags` | Update host tags. |
|
|
||||||
| POST | `/hosts/{id}/sources` and friends | Source-group CRUD. |
|
|
||||||
| POST | `/hosts/{id}/schedules` and friends | Schedule CRUD. |
|
|
||||||
| POST | `/hosts/{id}/repo/credentials`, `/admin-credentials` | Credential update. |
|
|
||||||
|
|
||||||
## Admin role only
|
|
||||||
|
|
||||||
| Method | Path | Purpose |
|
|
||||||
|--------|---------------------------------------|---------|
|
|
||||||
| POST | `/hosts/new` | Mint enrolment token (Add host). |
|
|
||||||
| POST | `/hosts/{id}/delete` | Delete + cascade. |
|
|
||||||
| POST | `/hosts/{id}/update` | Dispatch a single agent update. |
|
|
||||||
| GET/POST | `/settings/users/...` | User management. |
|
|
||||||
| POST | `/settings/notifications/...` | Notification channel CRUD + test fire. |
|
|
||||||
| POST | `/settings/fleet-update/...` | Fleet-update worker. |
|
|
||||||
|
|
||||||
## WebSocket
|
|
||||||
|
|
||||||
| Path | Who connects | Auth |
|
|
||||||
|--------------------------------|--------------|------|
|
|
||||||
| `/ws/agent` | Agent | Bearer token issued at enrolment. |
|
|
||||||
| `/ws/agent/pending` | Agent (announce flow) | Pending-id query param. |
|
|
||||||
| `/api/jobs/{id}/stream` | Browser | Session cookie. |
|
|
||||||
|
|
||||||
## RBAC enforcement
|
|
||||||
|
|
||||||
Routes are grouped into chi route-groups by required role
|
|
||||||
(`viewer < operator < admin`); the `requireRole` middleware in
|
|
||||||
`internal/server/http/middleware.go` is the bouncer. Sessions
|
|
||||||
re-validate `disabled_at` on every request, so a disabled user's
|
|
||||||
cookie stops working immediately.
|
|
||||||
@@ -1,32 +0,0 @@
|
|||||||
# Roadmap
|
|
||||||
|
|
||||||
The live roadmap is in
|
|
||||||
[`tasks.md`](https://gitea.dcglab.co.uk/steve/restic-manager/src/branch/main/tasks.md).
|
|
||||||
Phases ship in order; items inside a phase ship as the
|
|
||||||
opportunity arises.
|
|
||||||
|
|
||||||
## Status snapshot
|
|
||||||
|
|
||||||
| Phase | Theme | Status |
|
|
||||||
|-------|--------------------------------------------------|--------|
|
|
||||||
| 0 | Project bootstrap | ✅ done |
|
|
||||||
| 1 | MVP: enrolment, visibility, on-demand backup | ✅ done |
|
|
||||||
| 2 | Scheduling, retention, repo operations | ✅ done |
|
|
||||||
| 3 | Restore, alerts, audit | ✅ done |
|
|
||||||
| 4 | RBAC, OIDC, host tags | ✅ done |
|
|
||||||
| 5 | OSS readiness | 🚧 in flight (this docs site is part of it) |
|
|
||||||
| 6 | Update delivery + observability polish | ✅ done |
|
|
||||||
|
|
||||||
## What's not on the roadmap
|
|
||||||
|
|
||||||
The non-goals list in [`spec.md` §2](https://gitea.dcglab.co.uk/steve/restic-manager/src/branch/main/spec.md):
|
|
||||||
|
|
||||||
- Replacing restic itself or providing custom repo formats
|
|
||||||
- Managing non-restic backup tools
|
|
||||||
- Multi-tenancy / SaaS deployment
|
|
||||||
- High availability of the control plane (SQLite, single-instance)
|
|
||||||
- Mobile-native apps (responsive web only)
|
|
||||||
|
|
||||||
If something there is critical to your use case, restic-manager
|
|
||||||
isn't the right tool. That's not a closed door — it's a
|
|
||||||
deliberate scope decision so the project stays maintainable.
|
|
||||||
@@ -1,35 +0,0 @@
|
|||||||
# Reporting vulnerabilities
|
|
||||||
|
|
||||||
The full disclosure policy lives in
|
|
||||||
[`SECURITY.md`](https://gitea.dcglab.co.uk/steve/restic-manager/src/branch/main/SECURITY.md)
|
|
||||||
at the repo root. The short version:
|
|
||||||
|
|
||||||
- **Don't open a public issue.**
|
|
||||||
- Send a Gitea private message to `steve` on
|
|
||||||
<https://gitea.dcglab.co.uk>, or email the address on the
|
|
||||||
maintainer's profile, with a subject like
|
|
||||||
`[SECURITY] restic-manager: <one-line summary>`.
|
|
||||||
- Expect an acknowledgement within 3 working days; escalate
|
|
||||||
through the other channel if you don't get one.
|
|
||||||
- Default disclosure window is **30 days from confirmed report
|
|
||||||
to public disclosure**, faster if a PoC is already
|
|
||||||
circulating, slower only by mutual agreement.
|
|
||||||
|
|
||||||
## What to include
|
|
||||||
|
|
||||||
A description of the issue and the impact, the affected
|
|
||||||
component (server / agent / install script / docs), the version,
|
|
||||||
and reproduction steps. A working PoC is welcome but not
|
|
||||||
required — a credible threat model is enough.
|
|
||||||
|
|
||||||
## In scope vs. out of scope
|
|
||||||
|
|
||||||
See the full policy. Quick highlights:
|
|
||||||
|
|
||||||
- **In scope:** server, agent, install scripts, docker image,
|
|
||||||
docker-compose reference, crypto choices, docs that lead to
|
|
||||||
insecure configs.
|
|
||||||
- **Out of scope:** restic itself (report upstream), unpatched
|
|
||||||
third-party deps (report upstream first), pre-authenticated
|
|
||||||
admin abuse (admins are designed to have full power), DoS on
|
|
||||||
deployments without the recommended reverse proxy.
|
|
||||||
@@ -1,72 +0,0 @@
|
|||||||
# Hardening checklist
|
|
||||||
|
|
||||||
A baseline for new deployments. Most of these are defaults; the
|
|
||||||
list is here to make audit easy.
|
|
||||||
|
|
||||||
## Server
|
|
||||||
|
|
||||||
- [ ] Reverse proxy in front, TLS terminating at the proxy
|
|
||||||
(Caddy/nginx/Traefik).
|
|
||||||
- [ ] `RM_TRUSTED_PROXY` set to the proxy's CIDR.
|
|
||||||
- [ ] `RM_BASE_URL` matches the public hostname and the cookie
|
|
||||||
scope you want.
|
|
||||||
- [ ] `RM_COOKIE_SECURE=true` (the default; only set `false`
|
|
||||||
for local HTTP testing).
|
|
||||||
- [ ] HTTP listener bound to **localhost** in the compose file,
|
|
||||||
not `0.0.0.0`. The reverse proxy is the only thing that
|
|
||||||
should reach it.
|
|
||||||
- [ ] `secret.key` backed up separately from the database.
|
|
||||||
- [ ] Bootstrap token consumed and the printed log line scrubbed
|
|
||||||
from any log archive.
|
|
||||||
|
|
||||||
## Authentication
|
|
||||||
|
|
||||||
- [ ] Admin user has a password ≥ 12 characters (the floor).
|
|
||||||
- [ ] OIDC enabled if you have an IdP — local password auth
|
|
||||||
stays as a break-glass.
|
|
||||||
- [ ] Disabled (not deleted) any users who change roles or leave
|
|
||||||
so their session is invalidated immediately.
|
|
||||||
- [ ] The last-admin guard isn't tripped — there's always at
|
|
||||||
least one enabled admin user.
|
|
||||||
|
|
||||||
## Repo credentials
|
|
||||||
|
|
||||||
- [ ] Append-only credential set as the everyday cred for every
|
|
||||||
host.
|
|
||||||
- [ ] Admin credential set only where prune cadence is enabled.
|
|
||||||
- [ ] No credentials reused across hosts. Each host should have
|
|
||||||
its own credential pair so a single host compromise has a
|
|
||||||
single blast radius.
|
|
||||||
- [ ] If using rest-server, `--append-only` flag is on for the
|
|
||||||
everyday user; the prune user is a separate identity.
|
|
||||||
|
|
||||||
## Agent
|
|
||||||
|
|
||||||
- [ ] Agent runs as `root` (Linux) or `LocalSystem` (Windows)
|
|
||||||
**only when** the source paths require it. Otherwise pin
|
|
||||||
a service user that has read access to what's backed up
|
|
||||||
and nothing else.
|
|
||||||
- [ ] systemd unit's sandboxing flags are intact
|
|
||||||
(`NoNewPrivileges`, `Protect*`, `MemoryDenyWriteExecute`).
|
|
||||||
- [ ] Agent's config file `/etc/restic-manager/agent.yaml` is
|
|
||||||
mode `0600` and owned by the service user. The bearer
|
|
||||||
token lives in there.
|
|
||||||
|
|
||||||
## Operations
|
|
||||||
|
|
||||||
- [ ] Alerts wired to a real channel (webhook into Slack,
|
|
||||||
ntfy topic, SMTP) — not just sitting in the UI.
|
|
||||||
- [ ] Test-fire each notification channel after configuring.
|
|
||||||
- [ ] Audit-log retention is long enough to cover the operator's
|
|
||||||
incident-response window.
|
|
||||||
- [ ] Prometheus endpoint, if enabled, gated by token AND CIDR
|
|
||||||
where practical (default is opt-in / off).
|
|
||||||
|
|
||||||
## Recovery
|
|
||||||
|
|
||||||
- [ ] A documented procedure for rotating a leaked agent bearer
|
|
||||||
(delete + re-enrol the host).
|
|
||||||
- [ ] A test-restore done at least once, end-to-end, before
|
|
||||||
relying on the system in anger.
|
|
||||||
- [ ] `secret.key` and the SQLite database covered by separate
|
|
||||||
backup paths so that no single backup holds both the key and the data it encrypts.
|
|
||||||
@@ -1,110 +0,0 @@
|
|||||||
# Threat model
|
|
||||||
|
|
||||||
This page documents what restic-manager defends against, what it
|
|
||||||
doesn't, and the trust assumptions a deployment is making. The
|
|
||||||
canonical version lives in [`spec.md`](https://gitea.dcglab.co.uk/steve/restic-manager/src/branch/main/spec.md)
|
|
||||||
§11; the summary here is shaped for operators rather than
|
|
||||||
implementers.
|
|
||||||
|
|
||||||
## Trust boundaries
|
|
||||||
|
|
||||||
```
|
|
||||||
┌──────────────────────────────────────────┐
|
|
||||||
│ TRUSTED zone │
|
|
||||||
│ ┌─────────────┐ ┌──────────────┐ │
|
|
||||||
│ │ Operator's │ │ Reverse │ │
|
|
||||||
│ │ browser │◄──►│ proxy │ │ TLS terminates here
|
|
||||||
│ └─────────────┘ └──────┬───────┘ │
|
|
||||||
└────────────────────────────┼─────────────┘
|
|
||||||
│ HTTP, plaintext
|
|
||||||
│ (loopback or trusted LAN)
|
|
||||||
┌────────────────────────────▼─────────────┐
|
|
||||||
│ Server (control plane) │
|
|
||||||
└────────────┬─────────────────────────────┘
|
|
||||||
│ outbound WebSocket (TLS to clients via proxy)
|
|
||||||
│ — bearer-authenticated
|
|
||||||
┌────────────▼──────────────┐
|
|
||||||
│ Agent (per host) │ ◄── attacker model: assume one
|
|
||||||
└────────────┬──────────────┘ endpoint can be compromised
|
|
||||||
│ subprocess
|
|
||||||
▼
|
|
||||||
restic ──▶ repository (rest-server / S3 / SFTP / …)
|
|
||||||
```
|
|
||||||
|
|
||||||
## What we defend against
|
|
||||||
|
|
||||||
### Network attacker between operator and server
|
|
||||||
|
|
||||||
- HTTPS via the reverse proxy is the only operator-facing surface
|
|
||||||
on a sane deployment.
|
|
||||||
- `RM_COOKIE_SECURE=true` (default) means the session cookie
|
|
||||||
refuses to ride a non-HTTPS connection.
|
|
||||||
- `RM_TRUSTED_PROXY` gates whether `X-Forwarded-*` is honoured;
|
|
||||||
a bypassing request can't spoof the client IP.
|
|
||||||
|
|
||||||
### Compromised agent host
|
|
||||||
|
|
||||||
- The agent's bearer token can dispatch commands **only on its
|
|
||||||
own host**. It can't read other hosts' state, dispatch jobs
|
|
||||||
on other hosts, or escalate within the control plane.
|
|
||||||
- If you suspect a host compromise:
|
|
||||||
1. Delete the agent's host row via **Hosts → Delete**
|
|
||||||
(cascades the bearer hash).
|
|
||||||
2. Rotate the repo credential at the rest-server / object
|
|
||||||
store side.
|
|
||||||
3. Review the audit log, which lists every action that bearer ever drove.
|
|
||||||
|
|
||||||
### DB compromise without the secret key
|
|
||||||
|
|
||||||
- Repo credentials are AEAD-encrypted at rest. A DB dump alone
|
|
||||||
doesn't expose them.
|
|
||||||
- Agent bearer **hashes** are leaked; that's enough to
|
|
||||||
authenticate as any agent until you revoke. A rotation
|
|
||||||
procedure is just "delete + re-enrol" today.
|
|
||||||
- Operator passwords are bcrypt-hashed; OIDC users have no
|
|
||||||
password to leak.
|
|
||||||
- Session tokens are hashed; an attacker can't replay a
|
|
||||||
session from a DB dump.
|
|
||||||
|
|
||||||
### DB compromise WITH the secret key
|
|
||||||
|
|
||||||
The attacker can decrypt every credential. Treat
|
|
||||||
`secret.key` with the same care as a password manager database.
|
|
||||||
Back it up to a separate vault, not to the same Docker volume
|
|
||||||
as the database.
|
|
||||||
|
|
||||||
### Forget/prune as a DoS vector
|
|
||||||
|
|
||||||
- The everyday backup credential cannot prune (append-only).
|
|
||||||
- The admin credential is only pushed to the agent at the
|
|
||||||
moment of dispatch and discarded after the job ends.
|
|
||||||
- Compromise of a single agent host does **not** grant prune
|
|
||||||
rights — at worst the attacker gets fresh write access until
|
|
||||||
the credential is rotated.
|
|
||||||
|
|
||||||
### Operator-side typo or bad copy-paste
|
|
||||||
|
|
||||||
- Repo credentials are stored encrypted; mis-typed creds fail
|
|
||||||
fast on the next `restic` invocation rather than silently
|
|
||||||
corrupting state.
|
|
||||||
- NS-03 added auto-init: the first dispatched job after creds
|
|
||||||
change runs `restic init`, surfaces the error eagerly under
|
|
||||||
the host's vitals strip if the creds are bad, and resets the
|
|
||||||
host's `repo_status` so the operator can retry without
|
|
||||||
hunting through job logs.
|
|
||||||
|
|
||||||
## What we don't defend against
|
|
||||||
|
|
||||||
- **Insider threat at the maintainer level.** A malicious
|
|
||||||
maintainer can publish a backdoored container; SBOM /
|
|
||||||
signing infrastructure (Phase 6 candidate) would help here
|
|
||||||
but isn't shipped today.
|
|
||||||
- **Supply chain.** We pin module versions (`go.sum`) and
|
|
||||||
pin the Tailwind binary's release tag, but a compromise in
|
|
||||||
one of those upstreams would land here.
|
|
||||||
- **Side-channel via restic itself.** A bug in restic that
|
|
||||||
enables snapshot-content disclosure is restic's problem; the
|
|
||||||
control plane doesn't see snapshot bytes either way.
|
|
||||||
- **DoS via resource exhaustion** without the recommended
|
|
||||||
reverse-proxy / rate-limit in front. Don't expose the
|
|
||||||
server's HTTP port to the public internet directly.
|
|
||||||
-120
@@ -1,120 +0,0 @@
|
|||||||
# End-to-end test harness
|
|
||||||
|
|
||||||
The e2e harness stands up the full production-shaped stack
|
|
||||||
(server + agent + rest-server) in Docker Compose and drives it
|
|
||||||
through Playwright. CI runs it on every PR; operators can run it
|
|
||||||
locally too.
|
|
||||||
|
|
||||||
## Files
|
|
||||||
|
|
||||||
```
|
|
||||||
e2e/
|
|
||||||
├── compose.e2e.yml compose stack: server + rest-server + agent
|
|
||||||
├── Dockerfile.agent Linux container for the agent (alpine + restic)
|
|
||||||
├── agent-entrypoint.sh decides between announce / token-enrol / run
|
|
||||||
└── playwright/
|
|
||||||
├── package.json
|
|
||||||
├── playwright.config.ts
|
|
||||||
└── tests/
|
|
||||||
├── lib/server.ts bootstrap, login, accept, poll helpers
|
|
||||||
└── smoke.spec.ts happy-path: enrol → backup → succeeded
|
|
||||||
```
|
|
||||||
|
|
||||||
## Local run
|
|
||||||
|
|
||||||
Prerequisites: Docker + Docker Compose, and `npx` for Playwright.
|
|
||||||
|
|
||||||
```sh
|
|
||||||
# 1. Build + bring up the stack (server, rest-server, source data).
|
|
||||||
docker compose -f e2e/compose.e2e.yml up --build -d server rest-server source-fixture
|
|
||||||
|
|
||||||
# 2. Wait for the server, then scrape the bootstrap token from the log.
|
|
||||||
until curl -fsS http://127.0.0.1:8080/api/version >/dev/null; do sleep 1; done
|
|
||||||
RM_BOOTSTRAP_TOKEN=$(docker compose -f e2e/compose.e2e.yml logs server \
|
|
||||||
| grep -Eo '[a-zA-Z0-9_-]{40,}' | head -1)
|
|
||||||
export RM_BOOTSTRAP_TOKEN
|
|
||||||
|
|
||||||
# 3. Start the agent (it announces against the running server).
|
|
||||||
docker compose -f e2e/compose.e2e.yml up -d agent
|
|
||||||
|
|
||||||
# 4. Install + run Playwright.
|
|
||||||
cd e2e/playwright
|
|
||||||
npm install
|
|
||||||
npx playwright install --with-deps chromium
|
|
||||||
npx playwright test
|
|
||||||
```
|
|
||||||
|
|
||||||
When the tests pass you'll see:
|
|
||||||
|
|
||||||
```
|
|
||||||
Running 2 tests using 1 worker
|
|
||||||
✓ smoke: enrol-via-announce → backup › happy path completes in under a minute (47s)
|
|
||||||
✓ smoke: scrape /metrics › metrics endpoint exposes the host gauge (180ms)
|
|
||||||
|
|
||||||
2 passed (47.5s)
|
|
||||||
```
|
|
||||||
|
|
||||||
Tear-down:
|
|
||||||
|
|
||||||
```sh
|
|
||||||
docker compose -f e2e/compose.e2e.yml down -v
|
|
||||||
```
|
|
||||||
|
|
||||||
`-v` removes the named volumes too — important between runs because
|
|
||||||
the rest-server volume holds an initialised repo and the
|
|
||||||
agent-config volume holds a stale bearer.
|
|
||||||
|
|
||||||
## What the test exercises
|
|
||||||
|
|
||||||
1. **Bootstrap.** Posts the admin-creation request to
|
|
||||||
`/api/bootstrap` with the token scraped from the server log.
|
|
||||||
2. **Login (UI).** Drives the login form via Playwright; verifies
|
|
||||||
the dashboard loads with a session cookie set.
|
|
||||||
3. **Pending host appears.** Polls the dashboard for the inline
|
|
||||||
accept form generated by the announcing agent; reads the
|
|
||||||
pending-id out of its action URL.
|
|
||||||
4. **Accept.** POSTs `/api/pending-hosts/{id}/accept` with the
|
|
||||||
rest-server URL + repo password. The server mints a Host row
|
|
||||||
+ bearer + AEAD-encrypted creds and pushes the bearer down
|
|
||||||
the still-open pending WebSocket.
|
|
||||||
5. **Online + auto-init.** Polls `/api/hosts` until the new host
|
|
||||||
is `status=online`. Auto-init runs as part of this — the
|
|
||||||
first dispatched job after creds save is `restic init`.
|
|
||||||
6. **Run backup.** Submits the host detail page's `Run now`
|
|
||||||
form; expects `HX-Redirect` to the live job page.
|
|
||||||
7. **Verify.** Polls `/api/hosts` until the host's
|
|
||||||
`last_backup_status` flips to `succeeded`.
|
|
||||||
8. **Metrics.** Scrapes `/metrics` and asserts the
|
|
||||||
server-gauge + build-info lines are present (the compose
|
|
||||||
stack opens the endpoint via `RM_METRICS_TRUSTED_CIDR=0.0.0.0/0`).
|
|
||||||
|
|
||||||
## CI workflow
|
|
||||||
|
|
||||||
[`.gitea/workflows/e2e.yml`](../.gitea/workflows/e2e.yml) runs the
|
|
||||||
suite on every PR into `main`. On failure it dumps the last 200
|
|
||||||
lines of each container log as a workflow annotation and uploads
|
|
||||||
the Playwright HTML report as an artefact.
|
|
||||||
|
|
||||||
## When tests fail
|
|
||||||
|
|
||||||
- **Pending host never appears.** Agent container probably
|
|
||||||
couldn't reach the server. Check `docker compose logs agent`
|
|
||||||
for connection errors and `docker compose logs server` for
|
|
||||||
any 4xx on `/api/agents/announce`.
|
|
||||||
- **Backup hangs in `running`.** The agent shells out to
|
|
||||||
`restic`; check the live job log at
|
|
||||||
`http://127.0.0.1:8080/jobs/<id>` (still up after a
|
|
||||||
failed test as long as you didn't `down -v`).
|
|
||||||
- **`RM_BOOTSTRAP_TOKEN not set`.** The server log scrape
|
|
||||||
matched the wrong line or the token regex is too tight. The
|
|
||||||
server prints the token on a line starting with `    ` (four
|
|
||||||
spaces) inside a banner; widen the regex if your server log
|
|
||||||
format changes.
|
|
||||||
|
|
||||||
## Adding new tests
|
|
||||||
|
|
||||||
The harness is intentionally flat — one `*.spec.ts` per
|
|
||||||
scenario. Reuse the helpers in `lib/server.ts` and avoid
|
|
||||||
duplicating bootstrap / login boilerplate. Heavy fixtures
|
|
||||||
(custom users, OIDC IdP) belong in their own compose override
|
|
||||||
file rather than complicating `compose.e2e.yml`.
|
|
||||||
Binary file not shown.
|
Before Width: | Height: | Size: 27 KiB |
Binary file not shown.
|
Before Width: | Height: | Size: 98 KiB |
Binary file not shown.
|
Before Width: | Height: | Size: 178 KiB |
Binary file not shown.
|
Before Width: | Height: | Size: 48 KiB |
Binary file not shown.
|
Before Width: | Height: | Size: 92 KiB |
Binary file not shown.
|
Before Width: | Height: | Size: 47 KiB |
File diff suppressed because it is too large
Load Diff
@@ -1,259 +0,0 @@
|
|||||||
# P2 Completion Implementation Plan
|
|
||||||
|
|
||||||
> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task.
|
|
||||||
|
|
||||||
**Goal:** Close every remaining P2 task in `tasks.md`: P2R-09 (auto-init UX), P2R-10/11/12 (hooks), P2R-13 (bandwidth wiring + per-job override), P2R-14 (schedule next/last run), P2-16 (Windows svc), P2-17 (`install.ps1`), P2-18 (announce-and-approve).
|
|
||||||
|
|
||||||
**Architecture:** Server stays HTTP+WS; agent stays a single binary that auto-restages via `make build`. Hooks live on `source_groups` (and host-level defaults). Announce-and-approve adds a separate WS path (`/ws/agent/pending`) and a Pending hosts panel; token-flow stays default. Windows service support uses `golang.org/x/sys/windows/svc` behind a `//go:build windows` tag — Linux builds untouched. **Operator is away — make best guesses on small UX choices, but commit each item separately so the choices are reviewable.**
|
|
||||||
|
|
||||||
**Tech Stack:** Go 1.23+, chi router, modernc/sqlite, `coder/websocket`, `robfig/cron/v3`, HTMX + Tailwind, `golang.org/x/sys/windows/svc`, Ed25519 (stdlib).
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Pre-flight
|
|
||||||
|
|
||||||
- [ ] **Run baseline:** `go vet ./... && go build ./... && go test ./...` — must be green before starting. Restage agent + restart server (per CLAUDE.md restage block) so smoke env is warm.
|
|
||||||
|
|
||||||
## Order of execution
|
|
||||||
|
|
||||||
Smallest blast-radius first. Bandwidth → next/last → auto-init UI polish → hooks → announce → Windows. Commit and restage at each task boundary. Run `go vet ./... && go test ./...` before every commit.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Task 1 — P2R-13a: Wire bandwidth caps into restic invocations
|
|
||||||
|
|
||||||
**Files:**
|
|
||||||
- Modify: `internal/restic/runner.go` (add `LimitUploadKBps`, `LimitDownloadKBps` to `Env` or to a per-call options struct already present; emit `--limit-upload N`/`--limit-download N` on `restic backup|forget|prune|check|restore`)
|
|
||||||
- Modify: `internal/agent/runner/*.go` — pass host-wide caps into the runner. Caps come from `agent.config.Config` or are pushed via `config.update`. Decision: ship caps in the existing `config.update` envelope as new fields `bandwidth_up_kbps`, `bandwidth_down_kbps`. Server pushes on hello + on `PUT /api/hosts/{id}/bandwidth`.
|
|
||||||
- Modify: `internal/api/messages.go` — extend `ConfigUpdatePayload` with the two int pointers.
|
|
||||||
- Modify: `internal/server/ws/handler.go` (or wherever hello/config push lives) — include caps in the pushed config.
|
|
||||||
- Modify: `internal/server/http/host_bandwidth.go` — after `SetHostBandwidth`, fan out a `config.update` to the connected agent (mirror the credentials-edit path).
|
|
||||||
- Test: `internal/restic/runner_test.go` — assert flag injection.
|
|
||||||
- Test: `internal/server/ws/*_test.go` — assert config.update carries caps on hello and on edit.
|
|
||||||
|
|
||||||
- [ ] **Step 1.1** Add `LimitUploadKBps *int`, `LimitDownloadKBps *int` to whatever per-host config the runner already consults. Existing pattern is `restic.Env{}`; extend it.
|
|
||||||
- [ ] **Step 1.2** Failing test in `internal/restic/runner_test.go`: build a backup command with `LimitUploadKBps=1024`, assert the resulting argv contains `--limit-upload 1024`.
|
|
||||||
- [ ] **Step 1.3** Implement: prepend the flags in argv builders for `backup`, `forget`, `prune`, `check`, `restore`. Skip when nil/<=0.
|
|
||||||
- [ ] **Step 1.4** Wire `config.update` payload — server reads `Host.BandwidthUpKBps`/`DownKBps`, includes them in the existing `ConfigUpdatePayload` push on hello and on bandwidth edit (mirror cred-edit fan-out in `internal/server/http/host_credentials.go`).
|
|
||||||
- [ ] **Step 1.5** Agent applies caps: store in the in-memory dispatcher state on `config.update`, attach to every restic call.
|
|
||||||
- [ ] **Step 1.6** `go vet ./... && go test ./... && make build && <restage block>`. Commit:
|
|
||||||
```
|
|
||||||
agent+server: apply host bandwidth caps to restic invocations
|
|
||||||
```
|
|
||||||
|
|
||||||
## Task 2 — P2R-13b: Per-job override on Run-now confirm dialog
|
|
||||||
|
|
||||||
**Decision:** A small numeric input on the per-source-group Run-now button (and dashboard Run-all). Operator is away — keep it minimal: two optional inputs (up/down KB/s) on the dispatch endpoint; UI shows a `<details>` "Limit bandwidth for this run" disclosure with two number inputs.
|
|
||||||
|
|
||||||
**Files:**
|
|
||||||
- Modify: `internal/server/http/sources.go` (or wherever the per-group Run-now POST lives) — accept optional `bandwidth_up_kbps`/`bandwidth_down_kbps` form fields, pass through.
|
|
||||||
- Modify: dispatch path (`internal/server/dispatch_*.go` or `ws/handler.go` job-dispatch core) — accept overrides, include in the `command.run` payload.
|
|
||||||
- Modify: `internal/api/messages.go` — `CommandRunPayload` gains optional caps that take precedence over host-wide caps when present.
|
|
||||||
- Modify: agent dispatcher — use the payload override if present; otherwise fall back to the config caps.
|
|
||||||
- Modify: `web/templates/pages/host_sources.html` (and the schedules Run-now form) — `<details>` block.
|
|
||||||
- Test: HTTP test for the new form fields; agent runner test for override precedence.
|
|
||||||
|
|
||||||
- [ ] **Step 2.1** Failing test: POST to per-group Run-now with `bandwidth_up_kbps=512` → assert dispatched payload carries 512.
|
|
||||||
- [ ] **Step 2.2** Implement endpoint changes + payload extension.
|
|
||||||
- [ ] **Step 2.3** Agent override precedence test (payload wins over config).
|
|
||||||
- [ ] **Step 2.4** UI `<details>` blocks (one per Run-now form).
|
|
||||||
- [ ] **Step 2.5** Playwright spot-check via `:8080` smoke env: open Sources tab, expand the Run-now disclosure, fire with limit=128, then open the live job log and confirm the agent's restic argv (read `/tmp/rm-smoke/server.log` for the dispatched command — it logs argv) shows `--limit-upload 128`.
|
|
||||||
- [ ] **Step 2.6** Commit.
|
|
||||||
|
|
||||||
## Task 3 — P2R-14: Schedule "next run" / "last run"
|
|
||||||
|
|
||||||
**Files:**
|
|
||||||
- Modify: `internal/store/schedules.go` — add `NextRunAt(time.Time)` derivation helper and `LatestScheduledJobAt(host_id, schedule_id) (time.Time, error)` (or a single batched fetch for all schedules of a host).
|
|
||||||
- Modify: dashboard host row (`web/templates/partials/host_row.html`) — show "Next: …" and "Last: …" when there's a single covering schedule (already detected in slice 5).
|
|
||||||
- Modify: `web/templates/pages/host_schedules.html` — add Next/Last columns to the schedules table.
|
|
||||||
- Modify: relevant page handlers (`internal/server/http/ui_schedules.go`, dashboard handler) — populate the data.
|
|
||||||
- Test: `schedules_test.go` for next-run derivation (parse cron, compute next from a fixed `now`).
|
|
||||||
|
|
||||||
- [ ] **Step 3.1** Add `NextRun(cronExpr string, from time.Time) (time.Time, error)` helper using `robfig/cron/v3`'s `Parse(...).Next(from)`. Test with three crons.
|
|
||||||
- [ ] **Step 3.2** Add `LatestJobByActorKindForSchedule(host_id, schedule_id) (time.Time, status, error)` query against `jobs` (filter `actor_kind='schedule'` AND `schedule_id=?`, ORDER BY `started_at` DESC LIMIT 1).
|
|
||||||
- [ ] **Step 3.3** Wire schedules-page handler to populate Next/Last per row; render relative time + ISO tooltip (mirror existing `formatRelTime` template helper if it exists; otherwise use a simple "5m ago" helper).
|
|
||||||
- [ ] **Step 3.4** Wire dashboard row: when single covering schedule, surface "Next: 03:00" / "Last: 8h ago — succeeded".
|
|
||||||
- [ ] **Step 3.5** Playwright spot-check: a host with a schedule shows Next/Last; pause it → Next becomes "—" / "(paused)".
|
|
||||||
- [ ] **Step 3.6** Commit.
|
|
||||||
|
|
||||||
## Task 4 — P2R-09: Auto-init UX polish
|
|
||||||
|
|
||||||
**Files:**
|
|
||||||
- Modify: `web/templates/pages/host_repo.html` — danger-zone re-init button + two-step confirm (type the host name).
|
|
||||||
- Modify: `internal/server/http/ui_repo.go` (or new `repo_reinit.go`) — `POST /hosts/{id}/repo/reinit` admin-only, audit-logged. Server runs `restic init --force` (or wipes-then-inits — pick the safer of the two; restic doesn't truly wipe a repo, the operator must clear the bucket. **Best guess:** dispatch a normal `init` job with a flag that re-runs even if the repo claims to exist; if restic refuses, surface "the repo on the remote already has data — clear it manually before re-init" via the job log).
|
|
||||||
- Modify: host detail page header / vitals strip — surface init result line. Use the existing latest-`init`-job query to render "repo ready · initialised <relative time> ago" or "init failed · job N · retry".
|
|
||||||
- Test: HTTP test for re-init endpoint (auth, audit, host-name confirm); template test that the result line renders for both states.
|
|
||||||
|
|
||||||
- [ ] **Step 4.1** Reuse the existing helper `store.LatestJobByKind(host_id, "init")` (added in P2R-06) to fetch the latest init job; no new helper is needed.
|
|
||||||
- [ ] **Step 4.2** Render init line into vitals strip; show "init failed" amber when latest init failed.
|
|
||||||
- [ ] **Step 4.3** Implement `POST /hosts/{id}/repo/reinit` handler — admin role check, requires a `confirm_hostname` form field that must equal `host.Name`, returns 400 otherwise. Dispatches a fresh `init` job.
|
|
||||||
- [ ] **Step 4.4** Add danger-zone re-init form to `host_repo.html` (currently disabled per slice 4). Two-step confirm with the typed hostname.
|
|
||||||
- [ ] **Step 4.5** Playwright: visit `/hosts/{id}/repo`, click re-init, type wrong hostname → blocked; type right hostname → dispatches init job → returns to live log.
|
|
||||||
- [ ] **Step 4.6** Commit.
|
|
||||||
|
|
||||||
## Task 5 — P2R-10: Hook schema (migration 0010)
|
|
||||||
|
|
||||||
**Files:**
|
|
||||||
- Create: `internal/store/migrations/0010_hooks.sql`
|
|
||||||
- `ALTER TABLE source_groups ADD COLUMN pre_hook BLOB;` (AEAD ciphertext, NULLable)
|
|
||||||
- `ALTER TABLE source_groups ADD COLUMN post_hook BLOB;`
|
|
||||||
- `ALTER TABLE hosts ADD COLUMN pre_hook_default BLOB;`
|
|
||||||
- `ALTER TABLE hosts ADD COLUMN post_hook_default BLOB;`
|
|
||||||
- All four are AEAD ciphertext (existing `crypto.AEAD`); BLOB column type.
|
|
||||||
- Modify: `internal/store/types.go` — add `PreHook *string` (decrypted), `PostHook *string` to `SourceGroup`; same to `Host`.
|
|
||||||
- Modify: `internal/store/sources.go` + `internal/store/hosts.go` — getters/setters encrypt on write, decrypt on read. Pass `crypto.AEAD` through (pattern mirrors `host_credentials.go`).
|
|
||||||
- Test: encrypt/decrypt round-trip; setting `nil` clears the column.
|
|
||||||
|
|
||||||
- [ ] **Step 5.1** Write migration SQL. Column-level ALTERs only (per CLAUDE.md).
|
|
||||||
- [ ] **Step 5.2** Update store types + getters/setters with AEAD encrypt/decrypt. Mirror `internal/store/host_credentials.go` patterns exactly.
|
|
||||||
- [ ] **Step 5.3** Round-trip test: set hook on a source group; reload; assert plaintext returned. Set nil; assert nil after reload.
|
|
||||||
- [ ] **Step 5.4** `go vet && go test`. Commit.
|
|
||||||
|
|
||||||
## Task 6 — P2R-11: Agent execution of hooks
|
|
||||||
|
|
||||||
**Files:**
|
|
||||||
- Modify: `internal/api/messages.go` — `ConfigUpdatePayload` (or the per-source-group bundle inside `ScheduleSetPayload`) carries `PreHook`, `PostHook` plaintext (server has decrypted by then; wire is authenticated WS, same trust boundary as repo creds).
|
|
||||||
- Modify: agent dispatcher — for `kind=backup` only:
|
|
||||||
- Run `pre_hook` (if present) via `os/exec` with the host shell (`/bin/sh -c` on Linux, `cmd.exe /C` on Windows). Capture stdout+stderr → JobLog with `hook:` prefix. Non-zero exit aborts the backup, marks the job failed with `pre_hook` error.
|
|
||||||
- Run `post_hook` (if present) **always** after the backup, with `RM_JOB_STATUS=succeeded|failed` env var. Capture into JobLog, prefix `hook:`. Non-zero exit on post_hook does NOT change job status (warning logged).
|
|
||||||
- Skip both for `kind` ∈ {forget, prune, check, unlock, init} per spec.md §14.3.
|
|
||||||
- Test: dispatcher test with a `pre_hook` that exits 1 → backup not started; `post_hook` always runs and sees `RM_JOB_STATUS`.
|
|
||||||
|
|
||||||
- [ ] **Step 6.1** Plumb hooks through `ScheduleSetPayload` source-group bundle + per-group Run-now `command.run` payload (override host-default with group hook if both present). Server-side resolution: host default if group hook is empty.
|
|
||||||
- [ ] **Step 6.2** Agent dispatcher: factor hook execution into `internal/agent/runner/hooks.go`. Use `exec.CommandContext`, set env, plumb output to existing JobLog stream with `Source: "hook"` (or prefix the log lines `hook: …`).
|
|
||||||
- [ ] **Step 6.3** Failing test in `internal/agent/runner/runner_test.go` (create file if absent): `pre_hook=/bin/false` → job fails with `pre_hook failed (exit 1)` and the actual restic backup never runs (assert via mock-restic shim).
|
|
||||||
- [ ] **Step 6.4** Test: `post_hook` runs even when backup fails; receives `RM_JOB_STATUS=failed`.
|
|
||||||
- [ ] **Step 6.5** Test: hooks skipped on `forget`/`prune`/`check`/`unlock` jobs.
|
|
||||||
- [ ] **Step 6.6** `go vet && go test && make build && <restage block>`. Commit.
|
|
||||||
|
|
||||||
## Task 7 — P2R-12: Hook editor UI
|
|
||||||
|
|
||||||
**Files:**
|
|
||||||
- Modify: `web/templates/pages/source_group_edit.html` (new or extend existing source-group form) — `<textarea>` for pre_hook, `<textarea>` for post_hook, with the warning banner: "this hook runs as the agent service user (root on Linux; LocalSystem on Windows)".
|
|
||||||
- Modify: source-group HTTP handler (`internal/server/http/sources.go`) — accept hook fields on POST/PUT, encrypt-and-persist via store.
|
|
||||||
- Create: UI for the host-level hook defaults. A new section on the host-detail "Settings" tab was considered, but that tab is currently inert (per P1-25); a new sub-tab was the other option. **Decision:** add `pre_hook_default` / `post_hook_default` to the Repo page under a new "Hooks" section, since Settings is still inert.
|
|
||||||
- Modify: source-group form admin-only check; post-only edit allowed by operators? **Decision:** admin-only edit per spec; render but disable for operators.
|
|
||||||
- Modify: audit-log writer — emit `source_group.hook_updated` and `host.default_hook_updated` events (without the hook body).
|
|
||||||
- Test: HTTP test for create + update; admin-only enforcement; audit row written without secret.
|
|
||||||
|
|
||||||
- [ ] **Step 7.1** Source-group form extension + handler wiring.
|
|
||||||
- [ ] **Step 7.2** Repo page Hooks section (host defaults).
|
|
||||||
- [ ] **Step 7.3** Audit entries.
|
|
||||||
- [ ] **Step 7.4** Playwright: as admin, set a `pre_hook` of `echo hello`, fire Run-now, open live log, confirm `hook: hello` line appears.
|
|
||||||
- [ ] **Step 7.5** Commit.
|
|
||||||
|
|
||||||
## Task 8 — P2-18a: Announce schema + endpoint
|
|
||||||
|
|
||||||
**Files:**
|
|
||||||
- Create: `internal/store/migrations/0011_pending_hosts.sql`
|
|
||||||
```sql
|
|
||||||
CREATE TABLE pending_hosts (
|
|
||||||
id TEXT PRIMARY KEY,
|
|
||||||
hostname TEXT NOT NULL,
|
|
||||||
os TEXT NOT NULL,
|
|
||||||
arch TEXT NOT NULL,
|
|
||||||
agent_version TEXT NOT NULL,
|
|
||||||
restic_version TEXT NOT NULL,
|
|
||||||
public_key BLOB NOT NULL, -- 32-byte Ed25519
|
|
||||||
fingerprint TEXT NOT NULL, -- "SHA256:hex"
|
|
||||||
announced_from_ip TEXT NOT NULL,
|
|
||||||
first_seen_at TEXT NOT NULL,
|
|
||||||
last_seen_at TEXT NOT NULL,
|
|
||||||
expires_at TEXT NOT NULL
|
|
||||||
);
|
|
||||||
CREATE INDEX pending_hosts_expires ON pending_hosts(expires_at);
|
|
||||||
CREATE INDEX pending_hosts_fingerprint ON pending_hosts(fingerprint);
|
|
||||||
```
|
|
||||||
- Create: `internal/store/pending_hosts.go` — `CreatePendingHost`, `GetPendingHostByFingerprint`, `ListPendingHosts`, `DeletePendingHost`, `TouchPendingHost`, `DeleteExpiredPendingHosts`.
|
|
||||||
- Create: `internal/server/http/announce.go` — `POST /api/agents/announce` accepts `{hostname, os, arch, agent_version, restic_version, public_key (base64)}`. Validates protocol_version implicitly via `agent_version` check. Token-bucket rate limit per source IP (10/min). Global cap 100 pending rows. Returns `{fingerprint, pending_id, hostname_collision: bool}`.
|
|
||||||
- Test: `announce_test.go` — happy path; rate limit; cap; collision flag.
|
|
||||||
|
|
||||||
- [ ] **Step 8.1** Migration + store layer + tests.
|
|
||||||
- [ ] **Step 8.2** Endpoint + tests (use a fake clock + in-process token bucket).
|
|
||||||
- [ ] **Step 8.3** Commit.
|
|
||||||
|
|
||||||
## Task 9 — P2-18b: Pending WS + accept/reject
|
|
||||||
|
|
||||||
**Files:**
|
|
||||||
- Create: `internal/server/ws/pending.go` — `GET /ws/agent/pending` upgrade. Server issues a 32-byte nonce; agent signs it with its Ed25519 private key; server verifies against the `public_key` stored on the pending row keyed by the supplied `pending_id`. If valid, hold the connection open; on accept, push a single `enrolled` message containing `{bearer_token, repo_credentials_aead_blob}` and close cleanly. On reject, close with code 4001 + reason "rejected".
|
|
||||||
- Create: `internal/server/http/pending.go` — admin-only `POST /api/pending-hosts/{id}/accept` (atomically: mint bearer, encrypt admin-supplied repo creds (passed in form), promote pending row → real `hosts` row, push `enrolled` to the open WS, audit-log) and `POST /api/pending-hosts/{id}/reject` (delete row + close socket).
|
|
||||||
- Modify: server `main.go` route registration.
|
|
||||||
- Test: integration test — fake agent opens pending WS, admin POST /accept, agent receives bearer.
|
|
||||||
|
|
||||||
- [ ] **Step 9.1** Pending WS handler with nonce-sign verify.
|
|
||||||
- [ ] **Step 9.2** Accept/reject endpoints. Accept reuses the existing token-consume path internally (mints persistent bearer from `crypto.RandomToken`-style helper, inserts host row + `host_credentials`).
|
|
||||||
- [ ] **Step 9.3** Tests.
|
|
||||||
- [ ] **Step 9.4** Commit.
|
|
||||||
|
|
||||||
## Task 10 — P2-18c: Agent announce path
|
|
||||||
|
|
||||||
**Files:**
|
|
||||||
- Modify: `cmd/agent/main.go` — when `RM_TOKEN` is unset, switch to announce mode instead of erroring out. `RM_SERVER` still required.
|
|
||||||
- Create: `internal/agent/announce/announce.go` — generate-or-load Ed25519 keypair (persisted as a file alongside `secrets.enc`, mode 0600). POST `/api/agents/announce`. Open `/ws/agent/pending`. Wait. On `enrolled` message, persist bearer to `agent.yaml`, persist repo creds via existing secrets store, exit announce mode and reconnect via the normal WS path.
|
|
||||||
- Modify: `deploy/install/install.sh` — when `RM_TOKEN` is missing, run agent in announce mode and `journalctl --follow` until the agent prints the fingerprint, print it to the operator's terminal in big copy-friendly format, then keep following until enrolled.
|
|
||||||
- Test: end-to-end test in `internal/server/...` using a fake agent.
|
|
||||||
|
|
||||||
- [ ] **Step 10.1** Keypair generation + persistence.
|
|
||||||
- [ ] **Step 10.2** Announce client + pending WS client; print `SHA256:…` fingerprint to stdout in a banner.
|
|
||||||
- [ ] **Step 10.3** Install script branch.
|
|
||||||
- [ ] **Step 10.4** Playwright: register a host via announce mode (run agent locally with no RM_TOKEN), log into UI, see Pending hosts panel with the fingerprint, click Accept, confirm host appears.
|
|
||||||
- [ ] **Step 10.5** Commit.
|
|
||||||
|
|
||||||
## Task 11 — P2-18d: Pending hosts UI panel
|
|
||||||
|
|
||||||
**Files:**
|
|
||||||
- Modify: `web/templates/pages/dashboard.html` — add Pending hosts panel above the host list when any pending rows exist.
|
|
||||||
- Modify: dashboard handler — `Store.ListPendingHosts(now)` (auto-skips expired).
|
|
||||||
- Add buttons → POST `/api/pending-hosts/{id}/accept` and `/reject` via HTMX.
|
|
||||||
- Background sweeper for `DeleteExpiredPendingHosts` every 60s (mirror the existing offline-sweeper goroutine pattern).
|
|
||||||
|
|
||||||
- [ ] **Step 11.1** Sweeper goroutine.
|
|
||||||
- [ ] **Step 11.2** Dashboard handler + template.
|
|
||||||
- [ ] **Step 11.3** Accept form must include the same repo URL/user/pw fields as the token-mint form (admin still supplies repo creds at accept time).
|
|
||||||
- [ ] **Step 11.4** Playwright sweep.
|
|
||||||
- [ ] **Step 11.5** Commit.
|
|
||||||
|
|
||||||
## Task 12 — P2-16: Windows service integration
|
|
||||||
|
|
||||||
**Decision:** Cannot test on Windows from WSL. Goal is a clean compile under `GOOS=windows GOARCH=amd64` and code that follows the canonical `golang.org/x/sys/windows/svc/example` pattern. Untestable beyond compile + manual review; mark in commit message.
|
|
||||||
|
|
||||||
**Files:**
|
|
||||||
- Create: `internal/agent/service/service_windows.go` (build tag `//go:build windows`) — implements `svc.Handler`. `Execute` starts the agent's main loop in a goroutine, listens for `svc.Stop`/`svc.Shutdown`, cancels ctx, waits.
|
|
||||||
- Create: `internal/agent/service/service_other.go` (build tag `//go:build !windows`) — stub `Run` that just runs the agent loop in the foreground.
|
|
||||||
- Create: `internal/agent/service/install_windows.go` — `Install`, `Uninstall`, `Start`, `Stop` thin wrappers around `mgr` package.
|
|
||||||
- Modify: `cmd/agent/main.go` — sub-commands: `install`, `uninstall`, `start`, `stop`, `run` (default). `run` delegates to `service.Run()` which on Windows checks `svc.IsWindowsService()` and dispatches accordingly.
|
|
||||||
- Test: `internal/agent/service/service_windows_test.go` (build-tagged) for argv parsing only — actual SCM interaction can't be tested in CI.
|
|
||||||
|
|
||||||
- [ ] **Step 12.1** Implement the svc.Handler shell.
|
|
||||||
- [ ] **Step 12.2** Install/uninstall wrappers (use `mgr.ConnectLocal()`, `m.CreateService(name, exepath, mgr.Config{...}, "run")`).
|
|
||||||
- [ ] **Step 12.3** Cross-compile check: `GOOS=windows GOARCH=amd64 go build ./cmd/agent` must succeed.
|
|
||||||
- [ ] **Step 12.4** Commit with note "untested on Windows; compile-verified only".
|
|
||||||
|
|
||||||
## Task 13 — P2-17: install.ps1
|
|
||||||
|
|
||||||
**Files:**
|
|
||||||
- Create: `deploy/install/install.ps1` — PowerShell 5.1+ compatible. Checks admin elevation. Downloads agent binary from `$RM_SERVER/agent/binary?os=windows&arch=amd64`. Drops it at `C:\Program Files\restic-manager\restic-manager-agent.exe`. Runs `restic-manager-agent.exe install` (registers service). Starts it. Detects existing tasks named `*restic*` via `Get-ScheduledTask` and prints them — does not auto-disable. Writes `C:\ProgramData\restic-manager\agent.yaml` with `RM_SERVER` + `RM_TOKEN` (or no token if announce-mode).
|
|
||||||
- Modify: `internal/server/http/install.go` (or wherever install scripts are served) to also serve `/install/install.ps1`.
|
|
||||||
- Modify: CLAUDE.md restage block to also stage `install.ps1`.
|
|
||||||
|
|
||||||
- [ ] **Step 13.1** Write the script.
|
|
||||||
- [ ] **Step 13.2** Wire serving + restage.
|
|
||||||
- [ ] **Step 13.3** Smoke parse, if pwsh is on PATH: `pwsh -NoProfile -Command "Get-Command -Syntax (Get-ChildItem deploy/install/install.ps1)"`, or a parse-only check via `pwsh -NoProfile -c '$null = [scriptblock]::Create((Get-Content deploy/install/install.ps1 -Raw))'` (single-quoted so the calling shell does not expand `$null`). Skip if no pwsh available — note in commit.
|
|
||||||
- [ ] **Step 13.4** Commit.
|
|
||||||
|
|
||||||
## Task 14 — Final integration sweep
|
|
||||||
|
|
||||||
- [ ] **Step 14.1** `go vet ./... && go test ./... -race`. Full build. Restage. Restart server.
|
|
||||||
- [ ] **Step 14.2** Playwright walkthrough on `:8080`: login → dashboard shows pending-hosts empty state → create source group → set a `pre_hook` → Run-now with bandwidth override → confirm hook fires + bandwidth applied → schedules tab shows next/last → repo page shows init-OK line → re-init flow gated by typed hostname.
|
|
||||||
- [ ] **Step 14.3** Update `tasks.md`: tick P2R-09, P2R-10, P2R-11, P2R-12, P2R-13, P2R-14, P2-16, P2-17, P2-18 done. Update Phase 2 acceptance line items as satisfied.
|
|
||||||
- [ ] **Step 14.4** Open PR `p2-completion → main` with a summary of every item closed.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Decisions made on the operator's behalf (away)
|
|
||||||
|
|
||||||
1. **Bandwidth UI for per-job override:** small `<details>` disclosure under each Run-now button. Simpler than a modal; matches the rest of the app's progressive-disclosure style.
|
|
||||||
2. **Re-init UX:** server dispatches a fresh `init` job; if restic refuses because the repo already exists, surfaces the error in the job log and instructs the operator to clear the remote bucket. We don't try to forcibly wipe — too dangerous, and the agent doesn't have credentials to wipe S3/B2/etc generically.
|
|
||||||
3. **Hooks editor lives on the Repo page (host defaults) + on the source-group edit form (per-group override).** Skips inventing a new "Settings" tab since that surface is still inert.
|
|
||||||
4. **Announce flow:** admin still supplies repo creds at accept time (same form as the token-mint flow). The pending row only carries identity-of-the-endpoint material, never repo creds.
|
|
||||||
5. **Windows service:** compile-verified only; untested. Commit message will say so.
|
|
||||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -1,131 +0,0 @@
|
|||||||
# P5-03 implementation plan — Docker-only release
|
|
||||||
|
|
||||||
Spec: `docs/superpowers/specs/2026-05-05-p5-03-docker-only-release.md`.
|
|
||||||
|
|
||||||
Branch: `p5-03-docker-release`. Do not auto-open a PR (see CLAUDE.md
|
|
||||||
memory: CI runs are expensive on the self-hosted cluster).
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Slice 1 — Server config + handler fallback
|
|
||||||
|
|
||||||
**Goal:** server can serve agent binaries / install scripts from a
|
|
||||||
read-only "bundled assets" path when `<DataDir>` doesn't have them.
|
|
||||||
|
|
||||||
1. `internal/server/config/config.go` (or wherever `Cfg` lives) gains
|
|
||||||
a `BundledAssetsDir string` field, defaulting to
|
|
||||||
`/opt/restic-manager/dist`. Wire from `RM_BUNDLED_ASSETS_DIR` env
|
|
||||||
var, mirroring the existing env-var conventions.
|
|
||||||
2. `internal/server/http/agent_assets.go`:
|
|
||||||
- `handleAgentBinary`: try `<DataDir>/agent-binaries/<name>`
|
|
||||||
first; on `os.Stat` ENOENT, try
|
|
||||||
`<BundledAssetsDir>/agent-binaries/<name>`; on second ENOENT,
|
|
||||||
existing 404.
|
|
||||||
- `handleInstallAsset`: same dual-path, with `install/` subpath.
|
|
||||||
3. Tests in `internal/server/http/agent_assets_test.go` (new file):
|
|
||||||
- DataDir hit serves DataDir bytes.
|
|
||||||
- DataDir miss + bundled hit serves bundled bytes.
|
|
||||||
- DataDir hit shadows bundled.
|
|
||||||
- Both miss → 404 + existing error envelope.
|
|
||||||
- Path-traversal still rejected for `install/*` (regression check).
|
|
||||||
|
|
||||||
**Verify:** `go vet ./...` + `go test ./internal/server/http/...`.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Slice 2 — Version ldflags on both binaries
|
|
||||||
|
|
||||||
1. `cmd/server/main.go`: keep `var version`, add
|
|
||||||
`var commit = "none"` and `var date = "unknown"`. Surface via
|
|
||||||
existing version-log line.
|
|
||||||
2. `cmd/agent/main.go`: same three vars. Agent already reports
|
|
||||||
`agent_version` in the WS hello — extend to include commit if
|
|
||||||
it's already plumbed through `internal/api`; otherwise leave the
|
|
||||||
commit out of the wire and just log it on startup.
|
|
||||||
3. `Makefile`: extend the `make build` `-ldflags` to set all three
|
|
||||||
from `git describe --tags --always` + `git rev-parse HEAD` +
|
|
||||||
UTC timestamp. Source-build users get real values, not "dev".
|
|
||||||
4. `deploy/Dockerfile.server`: add `ARG COMMIT=none` and
|
|
||||||
`ARG DATE=unknown`; pass through `-ldflags`.
|
|
||||||
|
|
||||||
**Verify:** `make build && ./bin/restic-manager-server -version`
|
|
||||||
(or whatever the existing flag is) prints non-`dev` values.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Slice 3 — Dockerfile bakes agents + install assets
|
|
||||||
|
|
||||||
1. Build stage cross-compiles three agents:
|
|
||||||
|
|
||||||
```dockerfile
|
|
||||||
RUN go build -trimpath -ldflags="-s -w \
|
|
||||||
-X main.version=${VERSION} -X main.commit=${COMMIT} -X main.date=${DATE}" \
|
|
||||||
-o /out/agent/restic-manager-agent-linux-amd64 ./cmd/agent
|
|
||||||
ENV GOARCH=arm64
|
|
||||||
RUN go build ... -o /out/agent/restic-manager-agent-linux-arm64 ./cmd/agent
|
|
||||||
ENV GOOS=windows GOARCH=amd64
|
|
||||||
RUN go build ... -o /out/agent/restic-manager-agent-windows-amd64.exe ./cmd/agent
|
|
||||||
```
|
|
||||||
|
|
||||||
(Reset `GOOS`/`GOARCH` between layers via `ENV`. Server build
|
|
||||||
stays at `GOOS=linux GOARCH=$TARGETARCH`.)
|
|
||||||
|
|
||||||
2. Final stage `COPY --from=build`:
|
|
||||||
- `/out/restic-manager-server` → `/usr/local/bin/`
|
|
||||||
- `/out/agent/*` → `/opt/restic-manager/dist/agent-binaries/`
|
|
||||||
- `deploy/install/install.sh` →
|
|
||||||
`/opt/restic-manager/dist/install/install.sh`
|
|
||||||
- `deploy/install/install.ps1` →
|
|
||||||
`/opt/restic-manager/dist/install/install.ps1`
|
|
||||||
- `deploy/install/restic-manager-agent.service` →
|
|
||||||
`/opt/restic-manager/dist/install/restic-manager-agent.service`
|
|
||||||
|
|
||||||
3. Set `--chmod=0755` on the agent binaries and `install.sh`,
|
|
||||||
`--chmod=0644` on the unit file and `install.ps1`. Distroless
|
|
||||||
final stage runs as `nonroot`; bundled assets are readable by
|
|
||||||
anyone (mode `o+r`), so the user switch doesn't break reads.
|
|
||||||
|
|
||||||
**Verify:**
|
|
||||||
```sh
|
|
||||||
docker build -f deploy/Dockerfile.server -t rm:dev .
|
|
||||||
docker run --rm -d -p 18080:8080 \
|
|
||||||
-e RM_LISTEN=:8080 -e RM_DATA_DIR=/data \
|
|
||||||
-e RM_BASE_URL=http://127.0.0.1:18080 \
|
|
||||||
-v rm-test:/data rm:dev
|
|
||||||
curl -fsSL "http://127.0.0.1:18080/agent/binary?os=linux&arch=amd64" | wc -c
|
|
||||||
curl -fsSL "http://127.0.0.1:18080/install/install.sh" | head -1
|
|
||||||
```
|
|
||||||
|
|
||||||
Both should succeed against a fresh volume (no operator staging).
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Slice 4 — Release workflow
|
|
||||||
|
|
||||||
`.gitea/workflows/release.yml` per the spec. Two jobs:
|
|
||||||
|
|
||||||
1. **`image`**: checkout → setup-qemu → setup-buildx → login → compute
|
|
||||||
tags → buildx build+push.
|
|
||||||
2. (Future) `release-notes`: stub left as a TODO comment for now.
|
|
||||||
Operator can hand-write release notes via the Gitea UI on first
|
|
||||||
cut.
|
|
||||||
|
|
||||||
The `compute tags` shell step is the only non-trivial bit; tested
|
|
||||||
inline by running the script with mocked `GITHUB_REF_TYPE` /
|
|
||||||
`GITHUB_REF_NAME` env vars before committing.
|
|
||||||
|
|
||||||
**Verify on first dispatch:** trigger `workflow_dispatch` from the
|
|
||||||
Gitea UI, check the runner produces `:snapshot-<sha>` and pushes
|
|
||||||
multi-arch.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Slice 5 — Tasks.md + commit + push
|
|
||||||
|
|
||||||
1. `tasks.md`: tick P5-03; add a one-line note that goreleaser was
|
|
||||||
dropped in favour of Docker-only after a 2026-05-05 design pass
|
|
||||||
(link the spec).
|
|
||||||
2. `git add -A && git commit -m "p5-03: docker-only release path"`
|
|
||||||
(no Co-Authored-By trailer — CLAUDE.md rule).
|
|
||||||
3. `git push -u origin p5-03-docker-release`.
|
|
||||||
4. **Stop.** Do not open a PR. Wait for operator review.
|
|
||||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -1,473 +0,0 @@
|
|||||||
# P3 — Alerts (design)
|
|
||||||
|
|
||||||
> Phase 3 sub-spec covering the alerts engine, notification channels, and UI
|
|
||||||
> (P3-05 / P3-06 / P3-07).
|
|
||||||
>
|
|
||||||
> Wireframe: `_diag/p3-alerts-wireframe/wireframe.html`. Screenshots in the
|
|
||||||
> same directory. Spec brainstorm ran 2026-05-04; user approved all ten
|
|
||||||
> design decisions before this spec was written.
|
|
||||||
|
|
||||||
## Scope locked
|
|
||||||
|
|
||||||
Brainstorm decisions (in order asked):
|
|
||||||
|
|
||||||
1. **Rule model.** Hardcoded rule set, no operator-tunable thresholds in v1.
|
|
||||||
The engine knows about each rule type internally; per-rule config can land
|
|
||||||
later if/when an operator asks.
|
|
||||||
2. **Rule set.** Six rules: `backup_failed`, `forget_failed`, `prune_failed`,
|
|
||||||
`check_failed`, `stale_schedule`, `agent_offline`.
|
|
||||||
3. **Engine cadence.** Hybrid. Event hooks at the existing
|
|
||||||
`MarkJobFinished` and offline-sweeper sites for the immediate triggers;
|
|
||||||
one 60-second ticker handles stale-schedule detection and auto-resolution.
|
|
||||||
4. **Resolution.** Auto-resolve when the underlying condition clears + manual
|
|
||||||
Resolve at any time. Acknowledge is a separate "I've seen it" intermediate
|
|
||||||
state that does NOT close the alert.
|
|
||||||
5. **v1 channels.** Webhook + native ntfy + SMTP. Apprise deferred (the
|
|
||||||
channel plumbing accepts new kinds without reshaping). SMTP added as
|
|
||||||
a first-class channel post-brainstorm because the use case — overnight
|
|
||||||
alerts the operator wants to read in the morning rather than be pinged
|
|
||||||
on at 03:00 — is poorly served by ntfy's push model and clumsy via
|
|
||||||
webhook → email-gateway.
|
|
||||||
6. **Channel scope.** Global only. No per-host or per-severity routing in v1.
|
|
||||||
7. **Notification body.** Structured JSON for webhooks, formatted
|
|
||||||
title+body+click-URL for ntfy, plus a per-channel "Send test notification"
|
|
||||||
button with inline result feedback.
|
|
||||||
8. **Deduplication.** Open-alert uniqueness on `(host_id, kind)` with a
|
|
||||||
`last_seen_at` bump on every confirming tick. One notification per
|
|
||||||
occurrence; the UI shows "still happening · Ns ago" while a rule keeps
|
|
||||||
matching.
|
|
||||||
9. **Alert UI.** Top-level `/alerts` page (the existing nav stub becomes
|
|
||||||
real). Per-host vitals "Open alerts" cell links to `/alerts?host_id=...`.
|
|
||||||
Channel CRUD lives at `/settings/notifications`.
|
|
||||||
10. **Delivery semantics.** Best-effort fire-and-forget with a 5s timeout (10s for SMTP)
|
|
||||||
per notification. Failures are logged but not retried. The alert row in
|
|
||||||
the DB is the source of truth.
|
|
||||||
|
|
||||||
## Architecture
|
|
||||||
|
|
||||||
The subsystem is three loosely-coupled units behind one `AlertEngine`
|
|
||||||
goroutine:
|
|
||||||
|
|
||||||
```
|
|
||||||
┌───────────────────────────┐
|
|
||||||
event hooks ─────────────────►│ │
|
|
||||||
│ AlertEngine │ ──► raise/resolve
|
|
||||||
60s ticker ──────────────────►│ (rule evaluation) │ alert row
|
|
||||||
│ │
|
|
||||||
└────────────┬──────────────┘
|
|
||||||
│
|
|
||||||
▼
|
|
||||||
┌──────────────────────┐
|
|
||||||
│ notification.Hub │
|
|
||||||
│ (fire-and-forget) │
|
|
||||||
└──┬────────┬──────────┘
|
|
||||||
│ │
|
|
||||||
┌──────▼──┐ ┌──▼──────┐
|
|
||||||
│ Webhook │ │ Ntfy │ …future channels
|
|
||||||
└─────────┘ └─────────┘
|
|
||||||
```
|
|
||||||
|
|
||||||
### Component boundaries
|
|
||||||
|
|
||||||
| Component | Purpose | Depends on |
|
|
||||||
| ---------------------------------------- | ---------------------------------------------------------------------------------------- | -------------------------------------- |
|
|
||||||
| `internal/alert.Engine` | Owns the rule evaluation. Exposes `OnJobFinished`, `OnHostOffline`, `OnHostOnline` event hooks; runs a 60s ticker for stale-schedule + auto-resolution sweeps. Persists raises/resolves through the store. | store, notification.Hub, slog |
|
|
||||||
| `internal/alert.Rule` + per-rule files | Each of the six rules is a small struct with `Kind() string`, `Severity() string`, `MessageFor(ctx) string`. The engine iterates over a registered slice. | store models |
|
|
||||||
| `internal/notification.Hub`              | Receives "alert raised/acknowledged/resolved/test" events; fans out to enabled channels in parallel; logs results to a new `notification_log` table. | store, channel adapters |
|
|
||||||
| `internal/notification.Channel` (iface) | Single method `Send(ctx, payload) error` with a 5s context for HTTP channels, 10s for SMTP. Three impls in v1: `webhookChannel`, `ntfyChannel`, `smtpChannel`. | http.Client; net/smtp + crypto/tls for SMTP |
|
|
||||||
| `internal/store/alerts.go` | CRUD on `alerts` table: `RaiseOrTouch(host_id, kind, severity, message)`, `Acknowledge(id, user)`, `Resolve(id, by user)`, `AutoResolve(host_id, kind)`, `ListAlerts(filter)`, plus the `last_seen_at` bump. | sqlite |
|
|
||||||
| `internal/store/notification_channels.go` | CRUD on `notification_channels` (new table) + `notification_log` (new table). | sqlite, crypto.AEAD (for secrets) |
|
|
||||||
| `internal/server/http/ui_alerts.go` | `/alerts` page handler + filter parsing + ack/resolve form actions. | store |
|
|
||||||
| `internal/server/http/ui_notifications.go` | `/settings/notifications` page + channel CRUD + "Send test" handler. | store, notification.Hub |
|
|
||||||
|
|
||||||
### Engine event shape
|
|
||||||
|
|
||||||
The engine runs as one goroutine per server process started in
|
|
||||||
`cmd/server/main.go`. It exposes a small set of channels other code writes to:
|
|
||||||
|
|
||||||
```go
|
|
||||||
type Engine struct {
|
|
||||||
store *store.Store
|
|
||||||
hub *notification.Hub
|
|
||||||
|
|
||||||
// Event channels (buffered, drop-on-full with a slog warning to keep
|
|
||||||
// hot paths non-blocking). The engine drains them on its own
|
|
||||||
// goroutine, evaluates the rule, and acts.
|
|
||||||
jobFinished chan jobFinishedEvent // from store.MarkJobFinished hook
|
|
||||||
hostOffline chan string // host_id; from offline sweeper
|
|
||||||
hostOnline chan string // host_id; from ws handler hello
|
|
||||||
|
|
||||||
// 60s ticker drives stale-schedule + auto-resolution sweeps.
|
|
||||||
tick *time.Ticker
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
The hot-path call sites (`store.MarkJobFinished`, `ws.handler` offline
|
|
||||||
sweep, `ws.handler` hello) push to these channels via a tiny
|
|
||||||
`Engine.Notify*` method that does a non-blocking send. The engine's own
|
|
||||||
goroutine handles every match — keeps mutation off the hot path.
|
|
||||||
|
|
||||||
### Rule catalogue
|
|
||||||
|
|
||||||
| Kind | Severity | Trigger | Auto-resolve when |
|
|
||||||
| ------------------- | -------- | ----------------------------------------------------------------------- | -------------------------------------------------- |
|
|
||||||
| `backup_failed` | warning | `MarkJobFinished` with kind=backup, status=failed | next backup for the same host succeeds |
|
|
||||||
| `forget_failed` | warning | `MarkJobFinished` with kind=forget, status=failed | next forget for the same host succeeds |
|
|
||||||
| `prune_failed` | warning | `MarkJobFinished` with kind=prune, status=failed | next prune for the same host succeeds |
|
|
||||||
| `check_failed` | critical | `MarkJobFinished` with kind=check, status=failed OR errors_found | next check for the same host succeeds without errors |
|
|
||||||
| `stale_schedule` | warning | 60s ticker: a schedule's next-fire time is more than 5 minutes in the past with no matching job since | next job for that schedule succeeds OR schedule deleted |
|
|
||||||
| `agent_offline` | warning | offline-sweeper marks the host offline AND the host has been offline > 15 min (engine checks `last_seen_at`) | hostOnline event for that host |
|
|
||||||
|
|
||||||
The 15-minute floor on `agent_offline` exists so a 30-second blip during
|
|
||||||
agent restart doesn't generate a notification storm. The store's existing
|
|
||||||
offline sweeper (`hosts.last_seen_at` with 90s threshold) already marks the
|
|
||||||
host offline; the engine sees the event but waits for the threshold before
|
|
||||||
raising.
|
|
||||||
|
|
||||||
### Dedup + last_seen_at
|
|
||||||
|
|
||||||
`store.RaiseOrTouch(host_id, kind, severity, message)`:
|
|
||||||
|
|
||||||
```sql
|
|
||||||
SELECT id, last_seen_at FROM alerts
|
|
||||||
WHERE host_id = ? AND kind = ? AND resolved_at IS NULL
|
|
||||||
LIMIT 1;
|
|
||||||
```
|
|
||||||
|
|
||||||
- Found: `UPDATE alerts SET last_seen_at = ?, message = ? WHERE id = ?`,
|
|
||||||
return `(id, didRaise=false)`.
|
|
||||||
- Not found: `INSERT INTO alerts (id, host_id, kind, severity, message,
|
|
||||||
created_at, last_seen_at) VALUES (?, ?, ?, ?, ?, ?, ?)`, return
|
|
||||||
`(id, didRaise=true)`.
|
|
||||||
|
|
||||||
The engine fires a notification through the Hub only when `didRaise=true`.
|
|
||||||
Touch-only events keep the row's `last_seen_at` fresh so the UI can render
|
|
||||||
"still happening · Ns ago" without spamming the operator's phone.
|
|
||||||
|
|
||||||
### Notification payload shapes
|
|
||||||
|
|
||||||
**Webhook** — a single JSON envelope per event:
|
|
||||||
|
|
||||||
```json
|
|
||||||
{
|
|
||||||
"event": "alert.raised",
|
|
||||||
"alert_id": "01KQT...",
|
|
||||||
"severity": "warning",
|
|
||||||
"kind": "backup_failed",
|
|
||||||
"host_id": "01KQ...",
|
|
||||||
"host_name": "alfa-01",
|
|
||||||
"message": "Backup 'system-config' failed: rest-server returned 401",
|
|
||||||
"raised_at": "2026-05-04T15:42:01Z",
|
|
||||||
"link": "https://restic-manager.example/alerts/01KQT..."
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
`event` is one of `alert.raised | alert.acknowledged | alert.resolved |
|
|
||||||
alert.test`. The same envelope shape is reused across events — operators
|
|
||||||
build one bridge, switch on `event` and `severity`.
|
|
||||||
|
|
||||||
**SMTP** — single-recipient plain-text email per channel. The channel
|
|
||||||
config carries the SMTP server credentials and a `to` address; one
|
|
||||||
channel = one recipient (or one distribution-list address). Operators
|
|
||||||
who want multiple recipients add multiple channels — keeps the config
|
|
||||||
flat and the failure modes per-recipient.
|
|
||||||
|
|
||||||
Subject pattern is hardcoded (no per-channel template in v1):
|
|
||||||
|
|
||||||
```
|
|
||||||
Subject: [restic-manager] [<severity>] <host_name>: <kind>
|
|
||||||
From: <configured-from-address>
|
|
||||||
To: <configured-to-address>
|
|
||||||
Date: <RFC 5322>
|
|
||||||
Message-ID: <alert_id@<server-host>>
|
|
||||||
|
|
||||||
<message line — same string the webhook/ntfy gets>
|
|
||||||
|
|
||||||
—
|
|
||||||
Raised at: 2026-05-04T15:42:01Z
|
|
||||||
Severity: warning
|
|
||||||
Host: alfa-01
|
|
||||||
Kind: backup_failed
|
|
||||||
|
|
||||||
Open in restic-manager:
|
|
||||||
https://restic-manager.example/alerts/01KQT...
|
|
||||||
|
|
||||||
(This message was sent by restic-manager. Acknowledge or resolve in the UI.)
|
|
||||||
```
|
|
||||||
|
|
||||||
The body is plain text only in v1 — no HTML alternative — both because
|
|
||||||
the data is already structured well enough as text and because HTML
|
|
||||||
email opens a long tail of rendering / sanitisation concerns. The
|
|
||||||
`Message-ID` includes the alert id so a thread-aware client can group
|
|
||||||
related events (raised → acknowledged → resolved) together.
|
|
||||||
|
|
||||||
Encryption:
|
|
||||||
- **STARTTLS** (default, port 587). Opportunistic upgrade. Used by most
|
|
||||||
operator-facing relays.
|
|
||||||
- **Implicit TLS** (port 465). Connect-then-TLS-handshake.
|
|
||||||
- **None** (port 25). Plain. Hidden behind a "Yes I understand" warning
|
|
||||||
on the form because the password goes over the wire.
|
|
||||||
|
|
||||||
Auth:
|
|
||||||
- **PLAIN** (RFC 4616) over TLS. Default and almost always what's wanted.
|
|
||||||
- **CRAM-MD5** (RFC 2195). Offered if the server advertises it, no UI
|
|
||||||
toggle — automatic.
|
|
||||||
- No OAuth2 / XOAUTH2 in v1; that's a real next step if Gmail-without-
|
|
||||||
app-passwords becomes a recurring ask.
|
|
||||||
|
|
||||||
Per-message timeout is 10s (vs 5s for HTTP channels) — STARTTLS
|
|
||||||
handshake + DATA over a slow link can legitimately take that long.
|
|
||||||
|
|
||||||
**Ntfy** — uses the standard publish format:
|
|
||||||
|
|
||||||
```
|
|
||||||
POST /<topic> HTTP/1.1
|
|
||||||
Host: <server>
|
|
||||||
Authorization: Bearer <access-token> (if configured)
|
|
||||||
Title: [warning] alfa-01 backup failed
|
|
||||||
Priority: 4
|
|
||||||
Tags: warning,backup_failed
|
|
||||||
Click: https://restic-manager.example/alerts/01KQT...
|
|
||||||
|
|
||||||
Backup 'system-config' failed: rest-server returned 401
|
|
||||||
```
|
|
||||||
|
|
||||||
Severity → priority mapping:
|
|
||||||
|
|
||||||
| Severity | Priority |
|
|
||||||
| --------- | -------- |
|
|
||||||
| info | 3 (default) |
|
|
||||||
| warning | 4 (high) |
|
|
||||||
| critical | 5 (urgent) |
|
|
||||||
|
|
||||||
Per-channel `default_priority` setting overrides for non-critical alerts;
|
|
||||||
critical always goes urgent regardless.
|
|
||||||
|
|
||||||
### Test notification
|
|
||||||
|
|
||||||
`POST /api/notifications/{channel_id}/test` builds a synthetic event
|
|
||||||
(severity=info, kind=test_notification, message="Test from
|
|
||||||
restic-manager", link to the channel's edit page) and runs it through the
|
|
||||||
real send path. Returns `{ok: bool, latency_ms: int, status_code?: int,
|
|
||||||
error?: string}`. UI renders the green ✓ / red ✗ feedback inline.
|
|
||||||
|
|
||||||
## Routes added
|
|
||||||
|
|
||||||
| Method | Path | Purpose |
|
|
||||||
| ------- | ----------------------------------------------------- | ------------------------------------------------------------- |
|
|
||||||
| GET | `/alerts` | Fleet alerts list with filters (`?status=open&severity=warning&host_id=...&q=...`) |
|
|
||||||
| POST | `/alerts/{id}/acknowledge` | Mark alert acknowledged (HTMX form) |
|
|
||||||
| POST | `/alerts/{id}/resolve` | Manual resolve (HTMX form) |
|
|
||||||
| GET | `/settings/notifications` | Channel list page |
|
|
||||||
| GET | `/settings/notifications/new` | Channel kind picker + empty form |
|
|
||||||
| POST | `/settings/notifications/new` | Validate + create + redirect |
|
|
||||||
| GET | `/settings/notifications/{id}/edit` | Channel edit form |
|
|
||||||
| POST | `/settings/notifications/{id}/edit` | Validate + update |
|
|
||||||
| POST | `/settings/notifications/{id}/delete` | Delete channel (typed-confirm name in the form) |
|
|
||||||
| POST | `/api/notifications/{id}/test` | Fire test notification, return JSON result |
|
|
||||||
| GET | `/api/alerts` | JSON list (mirrors the UI filters) for future REST callers |
|
|
||||||
|
|
||||||
## Data model
|
|
||||||
|
|
||||||
### Migration 0013 — alerts.last_seen_at
|
|
||||||
|
|
||||||
```sql
|
|
||||||
ALTER TABLE alerts ADD COLUMN last_seen_at TEXT;
|
|
||||||
UPDATE alerts SET last_seen_at = created_at WHERE last_seen_at IS NULL;
|
|
||||||
```
|
|
||||||
|
|
||||||
Existing alerts (currently zero in production — nothing writes them yet)
|
|
||||||
get `last_seen_at = created_at`. Column is nullable for backwards-compat
|
|
||||||
with rows from the alert-engine-pre-bump period.
|
|
||||||
|
|
||||||
### Migration 0014 — notification_channels + notification_log
|
|
||||||
|
|
||||||
```sql
|
|
||||||
CREATE TABLE notification_channels (
|
|
||||||
id TEXT PRIMARY KEY,
|
|
||||||
kind TEXT NOT NULL CHECK (kind IN ('webhook', 'ntfy', 'smtp')),
|
|
||||||
name TEXT NOT NULL,
|
|
||||||
enabled INTEGER NOT NULL DEFAULT 1 CHECK (enabled IN (0, 1)),
|
|
||||||
config BLOB NOT NULL, -- AEAD-encrypted JSON; per-kind shape
|
|
||||||
default_priority TEXT, -- ntfy only; null for webhook + smtp
|
|
||||||
created_at TEXT NOT NULL,
|
|
||||||
updated_at TEXT NOT NULL,
|
|
||||||
last_fired_at TEXT
|
|
||||||
);
|
|
||||||
|
|
||||||
CREATE INDEX notification_channels_enabled ON notification_channels(enabled) WHERE enabled = 1;
|
|
||||||
|
|
||||||
CREATE TABLE notification_log (
|
|
||||||
id TEXT PRIMARY KEY,
|
|
||||||
channel_id TEXT NOT NULL REFERENCES notification_channels(id) ON DELETE CASCADE,
|
|
||||||
alert_id TEXT REFERENCES alerts(id) ON DELETE SET NULL,
|
|
||||||
event TEXT NOT NULL, -- alert.raised | alert.acknowledged | alert.resolved | alert.test
|
|
||||||
ok INTEGER NOT NULL CHECK (ok IN (0, 1)),
|
|
||||||
status_code INTEGER,
|
|
||||||
latency_ms INTEGER,
|
|
||||||
error TEXT,
|
|
||||||
fired_at TEXT NOT NULL
|
|
||||||
);
|
|
||||||
|
|
||||||
CREATE INDEX notification_log_channel ON notification_log(channel_id, fired_at DESC);
|
|
||||||
CREATE INDEX notification_log_alert ON notification_log(alert_id);
|
|
||||||
```
|
|
||||||
|
|
||||||
`config` is an AEAD-encrypted JSON blob — bearer tokens for webhooks, SMTP passwords, and
|
|
||||||
access tokens for ntfy live there. Per-kind config shapes:
|
|
||||||
|
|
||||||
```go
|
|
||||||
type webhookConfig struct {
|
|
||||||
URL string `json:"url"`
|
|
||||||
BearerToken string `json:"bearer_token,omitempty"`
|
|
||||||
HeaderName string `json:"header_name,omitempty"`
|
|
||||||
HeaderValue string `json:"header_value,omitempty"`
|
|
||||||
}
|
|
||||||
|
|
||||||
type ntfyConfig struct {
|
|
||||||
ServerURL string `json:"server_url"` // default https://ntfy.sh
|
|
||||||
Topic string `json:"topic"`
|
|
||||||
AccessToken string `json:"access_token,omitempty"`
|
|
||||||
}
|
|
||||||
|
|
||||||
type smtpConfig struct {
|
|
||||||
Host string `json:"host"` // e.g. smtp.example.com
|
|
||||||
Port int `json:"port"` // default 587 (STARTTLS), 465 (TLS), 25 (none)
|
|
||||||
Encryption string `json:"encryption"` // "starttls" | "tls" | "none"
|
|
||||||
Username string `json:"username"`
|
|
||||||
Password string `json:"password"` // sensitive — AEAD-encrypted with the rest of config
|
|
||||||
From string `json:"from"` // RFC 5322 address; "alerts@example.com" or "Restic-Manager <alerts@…>"
|
|
||||||
To string `json:"to"` // single recipient or distribution-list address; v1 = one channel = one to-line
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
### Engine state
|
|
||||||
|
|
||||||
The engine itself is stateless beyond the channels it owns; all
|
|
||||||
persisted state is in the existing `alerts` table + the new
|
|
||||||
`notification_log` table. A process restart re-evaluates from scratch:
|
|
||||||
on next tick the stale-schedule + auto-resolution sweeps catch up with
|
|
||||||
whatever happened during the downtime. No outbox to drain.
|
|
||||||
|
|
||||||
## UI templates
|
|
||||||
|
|
||||||
| Template | Purpose |
|
|
||||||
| ----------------------------------------- | ------------------------------------------------------ |
|
|
||||||
| `web/templates/pages/alerts.html` | Fleet alerts page |
|
|
||||||
| `web/templates/partials/alert_row.html` | One alert row (used by both list and detail-fragment swap) |
|
|
||||||
| `web/templates/pages/settings.html` | Settings shell with Notifications / Users / Auth sub-tabs |
|
|
||||||
| `web/templates/pages/notifications.html` | Channel list (Notifications sub-tab body) |
|
|
||||||
| `web/templates/pages/notification_edit.html` | Channel kind picker + per-kind form + test button + payload preview |
|
|
||||||
| `web/templates/partials/crit_banner.html` | Dashboard top-of-page banner |
|
|
||||||
| `web/templates/partials/nav.html` | Existing — gain a `data-alerts-count` attribute on the Alerts tab so the badge auto-updates |
|
|
||||||
|
|
||||||
The Settings shell + Notifications sub-tab is the new chrome the wireframe
|
|
||||||
introduced; Users + Authentication tabs are placeholder links that 404 in
|
|
||||||
v1 (or render a "Lands later" notice). Same pattern P2R-02 used for
|
|
||||||
inert sub-tabs.
|
|
||||||
|
|
||||||
## Tests (target coverage)
|
|
||||||
|
|
||||||
- `internal/alert/engine_test.go` — rule firing per kind: backup_failed
|
|
||||||
raises on `MarkJobFinished(kind=backup, status=failed)`; touch-only on
|
|
||||||
the second failure for the same host (no second notification);
|
|
||||||
auto-resolve on next success.
|
|
||||||
- `internal/alert/agent_offline_test.go` — `OnHostOffline` emits without
|
|
||||||
raising until the 15-min floor; `OnHostOnline` clears the alert.
|
|
||||||
- `internal/alert/stale_schedule_test.go` — synthetic schedule whose next
|
|
||||||
fire is in the past triggers; resets when a job lands.
|
|
||||||
- `internal/notification/webhook_test.go` — payload shape pinned;
|
|
||||||
authorisation header sent when bearer set; custom header echoed; 5s
|
|
||||||
timeout enforced; error in `notification_log`.
|
|
||||||
- `internal/notification/ntfy_test.go` — title/priority/tags/click headers
|
|
||||||
match the severity mapping; access token sent as `Authorization: Bearer
|
|
||||||
<token>`; default priority overridden by severity for critical.
|
|
||||||
- `internal/notification/smtp_test.go` — round-trip against a local
|
|
||||||
in-process fake SMTP server (Go's `net/smtp` is client-only; or MailHog if convenient):
|
|
||||||
STARTTLS handshake completes against a self-signed cert; PLAIN auth
|
|
||||||
uses configured creds; subject + from + to + body bytes match the
|
|
||||||
spec'd format; Message-ID contains the alert id; 10s timeout enforced;
|
|
||||||
failure path (auth refused) lands in `notification_log` with the
|
|
||||||
server's error string.
|
|
||||||
- `internal/server/http/ui_alerts_test.go` — page renders with filters
|
|
||||||
applied; ack/resolve POSTs flip the row + write audit; HX-Redirect
|
|
||||||
bounces back to the filtered list.
|
|
||||||
- `internal/server/http/ui_notifications_test.go` — CRUD happy paths,
|
|
||||||
validation re-render, secrets-encrypted-at-rest assertion (load row,
|
|
||||||
decrypt, compare), test-button hits the real send path against a
|
|
||||||
test http.Server.
|
|
||||||
- Migration 0013 + 0014 round-trip tested via `store.Open` on a fresh
|
|
||||||
db.
|
|
||||||
|
|
||||||
## Playwright sweep
|
|
||||||
|
|
||||||
End-of-phase sweep mirrors the P2R-02 / P3-restore pattern:
|
|
||||||
|
|
||||||
1. Login → `/alerts` (initially empty) → see "All clear · last alert
|
|
||||||
never" empty state.
|
|
||||||
2. Trigger a fake-failed-backup via `POST /api/hosts/{id}/jobs` against a
|
|
||||||
host with a deliberately-wrong rest-server URL. Wait for the
|
|
||||||
`backup_failed` alert to appear in the list within ~2s of the job
|
|
||||||
finishing.
|
|
||||||
3. Acknowledge → row tints + ack actor visible.
|
|
||||||
4. Take the agent offline (`systemctl stop`); wait 15 min OR mock
|
|
||||||
`last_seen_at` to 16 min ago via the test harness; confirm
|
|
||||||
`agent_offline` alert raises once.
|
|
||||||
5. Restart the agent → `agent_offline` auto-resolves; `backup_failed` is
|
|
||||||
still open.
|
|
||||||
6. Configure a webhook channel pointing at a local test sink; click "Send
|
|
||||||
test" → green ✓.
|
|
||||||
7. Configure a ntfy channel pointing at a local sink → click "Send test"
|
|
||||||
→ green ✓.
|
|
||||||
8. Configure an SMTP channel pointing at a local MailHog (Docker, port
|
|
||||||
1025, no TLS for the local-only sweep) → click "Send test" → green ✓
|
|
||||||
→ MailHog UI at :8025 shows the test email with the right subject
|
|
||||||
and Message-ID.
|
|
||||||
9. Trigger a fresh failed backup → all three channels receive the
|
|
||||||
notification (verified from sink logs + MailHog inbox);
|
|
||||||
`notification_log` has three rows `event=alert.raised, ok=1`.
|
|
||||||
10. Manually Resolve the open `backup_failed`; confirm all three channels
|
|
||||||
receive `event=alert.resolved`.
|
|
||||||
11. Critical-severity test: trigger `check_failed` (mocked) → dashboard
|
|
||||||
banner appears; clicking it lands on `/alerts?severity=critical&status=open`.
|
|
||||||
12. Empty the alerts again → banner disappears.
|
|
||||||
|
|
||||||
Screenshots into `_diag/p3-alerts-sweep/`. End-to-end clean, zero console
|
|
||||||
errors, before handing back.
|
|
||||||
|
|
||||||
## What does NOT change
|
|
||||||
|
|
||||||
- Existing chrome/templates beyond the small additions noted above.
|
|
||||||
- Existing `alerts.severity` CHECK (`info`/`warning`/`critical`) — already
|
|
||||||
the right shape; no migration needed for that.
|
|
||||||
- Audit log writer pattern — engine writes audit rows for ack/resolve
|
|
||||||
the same way every other state-changing handler does.
|
|
||||||
- The agent. Alerts are entirely a server concern; the agent doesn't
|
|
||||||
know they exist.
|
|
||||||
|
|
||||||
## Open questions / explicit non-goals
|
|
||||||
|
|
||||||
- **Per-rule cooldowns / re-raise on long-running issues.** Out of scope
|
|
||||||
(brainstorm question 8 ruled this out). Operators see "still happening"
|
|
||||||
in the UI; they don't get a reminder ping.
|
|
||||||
- **SMTP HTML emails.** v1 is plain text only — operators wanting rich
|
|
||||||
rendering can deploy a webhook → mail-merge bridge, or wait for a v2
|
|
||||||
template engine. The Message-ID threading + plain text body should be
|
|
||||||
enough for almost every overnight-digest workflow.
|
|
||||||
- **SMTP OAuth2 / XOAUTH2.** Out of scope. Gmail / Microsoft 365 with
|
|
||||||
modern OAuth requires an `app password` workaround in v1. Native
|
|
||||||
XOAUTH2 lands when an operator asks (or when Google starts refusing
|
|
||||||
app passwords for non-business accounts in earnest).
|
|
||||||
- **Multi-recipient SMTP channels.** A channel = one `To`. Operators
|
|
||||||
wanting multiple recipients add multiple channels. Keeps failure
|
|
||||||
attribution per-recipient.
|
|
||||||
- **Apprise sidecar integration.** Deferred per brainstorm. The
|
|
||||||
`Channel` interface accepts a fourth impl without reshaping when we get
|
|
||||||
there.
|
|
||||||
- **Per-host or per-severity channel routing.** Out of scope. Likely
|
|
||||||
next step if operators ask: a `min_severity` field on the channel row.
|
|
||||||
- **Snooze / mute.** Out of scope. Acknowledge is the closest analogue;
|
|
||||||
full silence-windows would need a new table and is YAGNI for v1.
|
|
||||||
- **PagerDuty / OpsGenie.** Both have webhook receivers; operators wire
|
|
||||||
them via the webhook channel today.
|
|
||||||
- **Alert "rules" UI.** No CRUD; the rule set is hardcoded.
|
|
||||||
@@ -1,342 +0,0 @@
|
|||||||
# P3 — Restore (design)
|
|
||||||
|
|
||||||
> Phase 3 sub-spec covering single-host restore (P3-01, P3-02, P3-03, P3-09).
|
|
||||||
> P3-04 (cross-host restore) is deferred to a new "Future / unscheduled"
|
|
||||||
> section in `tasks.md` — disaster recovery is already covered by re-enrolling
|
|
||||||
> a replacement host with the same repo credentials.
|
|
||||||
>
|
|
||||||
> Wireframe: `_diag/p3-restore-wizard/wireframe.html`. Screenshot:
|
|
||||||
> `_diag/p3-restore-wizard/01-full-wizard.png`.
|
|
||||||
|
|
||||||
## Scope locked
|
|
||||||
|
|
||||||
Brainstorm decisions (in order asked):
|
|
||||||
|
|
||||||
1. **In-place vs new-directory.** Default is a new directory under
|
|
||||||
`/var/restic-restore/<job-id>/`. A "Restore in place (overwrite original
|
|
||||||
paths)" toggle is gated by typed-confirmation of the host name, mirroring
|
|
||||||
the repo re-init pattern.
|
|
||||||
2. **Path-selection granularity.** Tree browser as the path selector, lazy-
|
|
||||||
loaded via `restic ls --json <snapshot> <path>` per directory expansion.
|
|
||||||
3. **Cross-host restore (P3-04).** Out of scope this phase. Move to
|
|
||||||
"Future / unscheduled" in `tasks.md`. The disaster-recovery case is covered
|
|
||||||
by the standard enrolment flow: stand up a replacement host, paste the
|
|
||||||
original repo creds at enrolment, snapshots reappear, restore is
|
|
||||||
same-host.
|
|
||||||
4. **Snapshot diff (P3-09).** Diff-as-a-job. New `JobDiff` JobKind dispatched
|
|
||||||
like every other agent operation. Output streams as `log.stream` and
|
|
||||||
renders on the live job log page.
|
|
||||||
5. **Wizard entry points.** Top-level "Restore" button on host detail
|
|
||||||
(`/hosts/{id}/restore`, opens wizard at step 1) plus a per-snapshot
|
|
||||||
Restore action on snapshot rows (`/hosts/{id}/snapshots/{sid}/restore`,
|
|
||||||
skips step 1).
|
|
||||||
6. **Wizard interaction model.** Single-page, sections progressively enable;
|
|
||||||
tree-browser nodes lazy-load via HTMX partials. No `restore_drafts` table.
|
|
||||||
7. **Tree-browser data path.** Synchronous WS RPC (`tree.list` ↔
|
|
||||||
`tree.list.result`, correlation-ID) plus a per-wizard-session in-memory
|
|
||||||
cache keyed by `{snapshot_id, path}` with ~30-min TTL.
|
|
||||||
8. **Restore progress UI.** Restore-specific job-page variant: files-restored
|
|
||||||
/ bytes-restored / throughput / ETA / current-file display, driven by
|
|
||||||
restic restore's JSON status events surfaced through `job.progress`.
|
|
||||||
9. **Permissions/ownership.** Policy, not toggle. In-place restore preserves
|
|
||||||
original ownership; new-directory restore drops ownership
|
|
||||||
(`--no-ownership`).
|
|
||||||
10. **Concurrency.** Single-flight per host (one job at a time across all
|
|
||||||
kinds). Plus a real cancel-job feature: `command.cancel` envelope, agent
|
|
||||||
kills the `restic` subprocess via context cancel (SIGTERM, SIGKILL after
|
|
||||||
grace), server transitions the job to `cancelled`. The "Cancel" button
|
|
||||||
already in the `job_detail` template becomes real for any running job
|
|
||||||
kind.
|
|
||||||
11. **Audit + safety.** Audit row on every restore dispatch (`host.restore`
|
|
||||||
with snapshot ID, paths, target, in-place flag). Recent-restores panel
|
|
||||||
on the host page surfacing the latest restore job alongside last-backup
|
|
||||||
and last-init signals. Role gate deferred to P4-03.
|
|
||||||
|
|
||||||
## Architecture
|
|
||||||
|
|
||||||
Restore composes from existing primitives plus three new pieces:
|
|
||||||
|
|
||||||
- **New JobKind values**: `JobRestore`, `JobDiff`. Dispatcher cases mirror
|
|
||||||
the prune/check pattern. Agent-side handlers wrap `restic.RunRestore` and
|
|
||||||
`restic.RunDiff` (new methods on the `restic` package).
|
|
||||||
- **New WS RPC**: `tree.list` request (`{snapshot_id, path}`) ↔
|
|
||||||
`tree.list.result` reply (`{entries: [{name, type, size}], ...}` or
|
|
||||||
`{error}`). Reuses existing correlation-ID infrastructure from P1-09. No
|
|
||||||
`jobs` row.
|
|
||||||
- **New cancel surface**: `command.cancel` request (`{job_id}`), agent
|
|
||||||
cancels the running subprocess context, returns `command.ack` + `job.finished`
|
|
||||||
with status `cancelled`. Server endpoint `POST /api/jobs/{id}/cancel`
|
|
||||||
bridges UI button → WS envelope.
|
|
||||||
|
|
||||||
Everything else (job lifecycle, log streaming, progress envelope, snapshot
|
|
||||||
listing, audit log writer, host_chrome partial, danger-zone typed-confirmation)
|
|
||||||
already exists and is reused verbatim.
|
|
||||||
|
|
||||||
### Component boundaries
|
|
||||||
|
|
||||||
| Component | Purpose | Depends on |
|
|
||||||
| ---------------------------------- | ---------------------------------------------------- | ----------------------------------------- |
|
|
||||||
| `internal/restic.RunRestore` | Run `restic restore` with paths + target + ownership | `restic.Env` |
|
|
||||||
| `internal/restic.RunDiff` | Run `restic diff --json a b` | `restic.Env` |
|
|
||||||
| `internal/agent/runner` cases | Dispatch `JobRestore` / `JobDiff` jobs | `restic.Run*`, hooks (skipped: backup-only) |
|
|
||||||
| `internal/agent/runner` cancel hook | Wire WS `command.cancel` → ctx.CancelFunc per job | runner job map |
|
|
||||||
| `internal/agent/runner` tree-list | Sync RPC handler: `restic ls --json` for one path | `restic.Env` |
|
|
||||||
| `internal/server/ws/cancel.go` | Validate + send `command.cancel` envelope | hub.Send, store.UpdateJobStatus |
|
|
||||||
| `internal/server/ws/tree.go` | RPC mediator: `tree.list` request → reply, with cache | hub.SendRPC, in-memory cache |
|
|
||||||
| `internal/server/http/restore.go` | Wizard routes + dispatch endpoint | store, ws, audit |
|
|
||||||
| `internal/server/http/diff.go` | Snapshot-diff dispatch endpoint | store, ws |
|
|
||||||
| `internal/server/http/cancel.go` | `POST /api/jobs/{id}/cancel` | ws |
|
|
||||||
| `web/templates/pages/host_restore.html` | Wizard page | host_chrome partial |
|
|
||||||
| `web/templates/partials/tree_node.html` | Lazy-loaded tree node fragment for HTMX swap | — |
|
|
||||||
| `web/templates/pages/job_detail.html` | Restore-kind progress widget (variant) | existing job_detail |
|
|
||||||
|
|
||||||
### Data flow — wizard happy path
|
|
||||||
|
|
||||||
```
|
|
||||||
operator
|
|
||||||
├─ GET /hosts/{id}/restore
|
|
||||||
│ server renders wizard shell, snapshot table from store.ListSnapshotsByHost
|
|
||||||
│
|
|
||||||
├─ click snapshot row (or arrives via /hosts/{id}/snapshots/{sid}/restore)
|
|
||||||
│ wizard advances to step 2, snapshot summary card rendered
|
|
||||||
│
|
|
||||||
├─ expand a tree node (chevron click)
|
|
||||||
│ HTMX GET /hosts/{id}/restore/tree?snapshot={sid}&path=/etc
|
|
||||||
│ server checks per-session cache (keyed by sid+path)
|
|
||||||
│ hit → render tree_node fragment from cache
|
|
||||||
│ miss → hub.SendRPC(host_id, "tree.list", {sid, path}) → wait reply
|
|
||||||
│ cache result, render tree_node fragment
|
|
||||||
│
|
|
||||||
├─ tick file/dir checkboxes (form state, no round-trip)
|
|
||||||
│
|
|
||||||
├─ pick target radio (and optionally type host name to unlock in-place)
|
|
||||||
│
|
|
||||||
└─ POST /hosts/{id}/restore (form submit)
|
|
||||||
server validates: ≥1 path, target mode, in-place ⇒ host name match
|
|
||||||
write audit row host.restore
|
|
||||||
store.CreateJob{kind=restore, payload={snapshot_id, paths, target, in_place}}
|
|
||||||
hub.Send(host_id, "command.run", {job_id, kind=restore, payload})
|
|
||||||
HX-Redirect: /jobs/{job_id}
|
|
||||||
```
|
|
||||||
|
|
||||||
### Data flow — agent restore execution
|
|
||||||
|
|
||||||
```
|
|
||||||
agent.runner receives command.run kind=restore
|
|
||||||
├─ check single-flight: if r.activeJobID != "" → reply busy
|
|
||||||
│ (server queues to pending_runs only for kind=backup; restore returns busy)
|
|
||||||
├─ allocate ctx, ctxCancel — store cancelFunc against job_id in r.cancels
|
|
||||||
├─ sendStarted(job_id, JobRestore, now)
|
|
||||||
├─ build target path: if in_place → "/" else "/var/restic-restore/<job_id>/"
|
|
||||||
├─ build flags: paths from payload, --no-ownership when !in_place
|
|
||||||
├─ restic.RunRestore(ctx, env, snapshot_id, paths, target, in_place):
|
|
||||||
│ restic restore <sid> --target <path> [--no-ownership] -- <p1> <p2> ...
|
|
||||||
│ parse stdout JSON: forward "status" → job.progress (1Hz throttle), "summary" → final
|
|
||||||
├─ on success: sendFinished(job_id, succeeded, exit=0)
|
|
||||||
├─ on ctx.Err() == context.Canceled: sendFinished(job_id, cancelled, exit=130)
|
|
||||||
└─ delete cancel func from r.cancels
|
|
||||||
```
|
|
||||||
|
|
||||||
### Data flow — cancel
|
|
||||||
|
|
||||||
```
|
|
||||||
operator clicks Cancel on /jobs/{id} (running)
|
|
||||||
POST /api/jobs/{id}/cancel
|
|
||||||
server: lookup job, ensure status=running, find host
|
|
||||||
hub.Send(host_id, "command.cancel", {job_id})
|
|
||||||
→ agent.runner receives command.cancel
|
|
||||||
cancelFunc, ok := r.cancels[job_id]
|
|
||||||
ok && cancelFunc()
|
|
||||||
→ restic subprocess context done → exec.Cmd kills via SIGTERM
|
|
||||||
→ if still alive after 5s grace → SIGKILL
|
|
||||||
→ runner sendFinished(job_id, cancelled, exit=130)
|
|
||||||
→ server receives job.finished status=cancelled, persists, broadcasts
|
|
||||||
→ browser refresh shows cancelled state
|
|
||||||
```
|
|
||||||
|
|
||||||
The cancel surface is independently useful for any kind (prune/check/backup) —
|
|
||||||
not gated to restore. The button already in `job_detail.html` becomes real.
|
|
||||||
|
|
||||||
### Tree-list RPC details
|
|
||||||
|
|
||||||
New WS message types (added to `internal/api/messages.go`):
|
|
||||||
|
|
||||||
```
|
|
||||||
type TreeListRequestPayload struct {
|
|
||||||
SnapshotID string `json:"snapshot_id"`
|
|
||||||
Path string `json:"path"`
|
|
||||||
}
|
|
||||||
|
|
||||||
type TreeListEntry struct {
|
|
||||||
Name string `json:"name"`
|
|
||||||
Type string `json:"type"` // "dir" | "file" | "symlink"
|
|
||||||
Size int64 `json:"size,omitempty"`
|
|
||||||
}
|
|
||||||
|
|
||||||
type TreeListResultPayload struct {
|
|
||||||
SnapshotID string `json:"snapshot_id"`
|
|
||||||
Path string `json:"path"`
|
|
||||||
Entries []TreeListEntry `json:"entries,omitempty"`
|
|
||||||
Error string `json:"error,omitempty"`
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
Server-side mediator (`ws.SendRPC`) takes a request envelope, registers the
|
|
||||||
correlation ID in a pending map, sends, blocks on a per-call channel until
|
|
||||||
the matching reply arrives (or 30s timeout). The pattern is small enough
|
|
||||||
to inline in `internal/server/ws/rpc.go` as a generic helper — future
|
|
||||||
synchronous RPCs reuse it.
|
|
||||||
|
|
||||||
In-memory cache: `map[sessionID]map[cacheKey]TreeListResultPayload` with
|
|
||||||
`cacheKey = snapshot_id + "\x00" + path`. Session ID minted per wizard
|
|
||||||
load (HTTP-only cookie scoped to `/hosts/{id}/restore/tree`, lifetime 30
|
|
||||||
min). On wizard close (browser navigation away) the entry expires
|
|
||||||
naturally. No persistence, no migration.
|
|
||||||
|
|
||||||
Agent handler runs `restic ls --json <sid> <path>` (non-recursive — restic
|
|
||||||
defaults to recursive but `restic ls` accepts `--long` and a path filter;
|
|
||||||
parse output line-by-line and emit only direct children of `path`). 60s
|
|
||||||
context timeout, mirroring existing `restic snapshots` invocation.
|
|
||||||
|
|
||||||
### Restore payload
|
|
||||||
|
|
||||||
`api.CommandRunPayload` gains a nested optional `restore` field:
|
|
||||||
|
|
||||||
```
|
|
||||||
type RestorePayload struct {
|
|
||||||
SnapshotID string `json:"snapshot_id"`
|
|
||||||
Paths []string `json:"paths"` // absolute paths inside the snapshot
|
|
||||||
InPlace bool `json:"in_place"`
|
|
||||||
TargetDir string `json:"target_dir"` // empty when in_place=true
|
|
||||||
PreserveOwner bool `json:"preserve_owner"` // mirrors policy: in_place=>true, else=>false
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
The payload is set by the server when dispatching `JobRestore` and ignored
|
|
||||||
on every other kind. Wire-shape test pinned in `wire_test.go`.
|
|
||||||
|
|
||||||
### Diff payload
|
|
||||||
|
|
||||||
`api.CommandRunPayload` gains:
|
|
||||||
|
|
||||||
```
|
|
||||||
type DiffPayload struct {
|
|
||||||
SnapshotA string `json:"snapshot_a"`
|
|
||||||
SnapshotB string `json:"snapshot_b"`
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
Set on `JobDiff`. Output is plain `restic diff --json <a> <b>` forwarded as
|
|
||||||
`log.stream` lines. Job page renders unchanged — operator reads the diff
|
|
||||||
output directly.
|
|
||||||
|
|
||||||
### Recent-restores panel
|
|
||||||
|
|
||||||
A small panel rendered on the host detail page below the existing init-status
|
|
||||||
line:
|
|
||||||
|
|
||||||
```
|
|
||||||
last restore: succeeded 2h ago · job f73ab4c1… · 3 files to /var/restic-restore/...
|
|
||||||
```
|
|
||||||
|
|
||||||
Backed by a `store.LatestJobByKind(host_id, JobRestore)` call (reusing
|
|
||||||
the existing `store.LatestJobByKind` already used for init/forget/prune/check
|
|
||||||
in P2R-06). One template addition in `host_chrome.html` next to the
|
|
||||||
`InitStatus` block.
|
|
||||||
|
|
||||||
## Routes added
|
|
||||||
|
|
||||||
| Method | Path | Purpose |
|
|
||||||
| ------- | --------------------------------------------------------- | ----------------------------------------------------------- |
|
|
||||||
| GET | `/hosts/{id}/restore` | Wizard shell (step 1 = snapshot picker) |
|
|
||||||
| GET | `/hosts/{id}/snapshots/{sid}/restore` | Wizard shell with snapshot pre-selected (skips step 1) |
|
|
||||||
| GET | `/hosts/{id}/restore/tree` | HTMX partial: tree node listing for `?snapshot=&path=` |
|
|
||||||
| POST | `/hosts/{id}/restore` | Validate + dispatch restore job, redirect to live job page |
|
|
||||||
| POST | `/api/hosts/{id}/snapshots/diff` | Dispatch a diff job for `{snapshot_a, snapshot_b}` |
|
|
||||||
| POST | `/api/jobs/{id}/cancel` | Send `command.cancel` to host, transition job → cancelled |
|
|
||||||
|
|
||||||
## Migrations
|
|
||||||
|
|
||||||
None. Restore + diff piggyback on the existing `jobs` table (their `kind` is
|
|
||||||
new but the schema already accepts arbitrary kind strings — there's no
|
|
||||||
CHECK constraint on `kind`). The cancel feature uses the existing
|
|
||||||
`JobCancelled` terminal status. The tree-list cache lives in process memory.
|
|
||||||
|
|
||||||
## Tests (target coverage)
|
|
||||||
|
|
||||||
- `internal/restic/restore_test.go` — `RunRestore` invocation builds the
|
|
||||||
expected argv (paths, --target, --no-ownership flag presence, in-place
|
|
||||||
variant); JSON status parsing → `BackupStatus`-shaped progress envelopes.
|
|
||||||
- `internal/restic/diff_test.go` — `RunDiff` argv shape and JSON forwarding.
|
|
||||||
- `internal/agent/runner/restore_test.go` — happy path, cancel mid-run
|
|
||||||
produces `cancelled` finished, in-place vs new-directory dispatch,
|
|
||||||
single-flight rejects when another job is running.
|
|
||||||
- `internal/agent/runner/tree_test.go` — `tree.list` handler returns
|
|
||||||
direct children for a synthetic restic ls output, surfaces error on
|
|
||||||
missing snapshot.
|
|
||||||
- `internal/server/ws/rpc_test.go` — `SendRPC` correlation matching,
|
|
||||||
timeout, concurrent calls.
|
|
||||||
- `internal/server/http/restore_test.go` — wizard renders with snapshots,
|
|
||||||
POST validates ≥1 path + in-place host-name match, audit row written,
|
|
||||||
job dispatched with correct payload, in-place without typed-confirm
|
|
||||||
re-renders form with input intact and an error.
|
|
||||||
- `internal/server/http/diff_test.go` — POST dispatches `JobDiff`,
|
|
||||||
snapshot IDs validated against the host's snapshot list.
|
|
||||||
- `internal/server/http/cancel_test.go` — POST cancel happy path
|
|
||||||
(running → cancelled), 4xx for non-running jobs, 4xx when host offline.
|
|
||||||
- `internal/server/http/restore_e2e_test.go` — happy path: GET wizard,
|
|
||||||
expand `/etc` (HTMX call returns expected fragment), submit, follow
|
|
||||||
HX-Redirect to job page, see status.
|
|
||||||
- `web/templates/pages/host_restore_test.go` (template-render test) —
|
|
||||||
wizard renders all four sections; in-place card disabled until typed
|
|
||||||
confirm.
|
|
||||||
|
|
||||||
## Playwright iteration / sweep
|
|
||||||
|
|
||||||
A Playwright sweep at the end (mirroring P2R-02 Slice 6) runs against the
|
|
||||||
local smoke server with a real agent enrolled. Steps:
|
|
||||||
|
|
||||||
1. Login → navigate to alfa-01 host → click Restore.
|
|
||||||
2. Wizard step 1: pick the most recent snapshot.
|
|
||||||
3. Wizard step 2: expand a directory two levels, tick three files,
|
|
||||||
verify tally updates.
|
|
||||||
4. Wizard step 3: leave default new-directory.
|
|
||||||
5. Wizard step 4: dispatch.
|
|
||||||
6. Land on live job page, see progress widget animating, see log lines.
|
|
||||||
7. Click Cancel mid-flight, verify status transitions to cancelled and
|
|
||||||
the agent's subprocess actually died (log line `signal: killed` or exit
|
|
||||||
130).
|
|
||||||
8. Repeat with in-place mode: type host name, dispatch, verify red
|
|
||||||
primary button, verify files actually overwritten on host.
|
|
||||||
9. Snapshot diff: navigate to snapshots, pick two, dispatch diff, see
|
|
||||||
diff output streamed.
|
|
||||||
10. Screenshots into `_diag/p3-restore-sweep/`.
|
|
||||||
|
|
||||||
End-to-end clean, zero console errors, before handing back.
|
|
||||||
|
|
||||||
## What does NOT change
|
|
||||||
|
|
||||||
- `host_chrome.html` only grows the recent-restores line; sub-tab list
|
|
||||||
unchanged (Restore is a top-level button on the host page, not a sub-tab).
|
|
||||||
- `enrollment.go`, schedule reconciliation, source-group CRUD, repo
|
|
||||||
maintenance ticker, hook execution — none of these are touched.
|
|
||||||
- The CLAUDE.md restage block applies as-is when the agent binary changes
|
|
||||||
(it does — runner gains restore/diff/cancel/tree handlers). The unit
|
|
||||||
file does not change.
|
|
||||||
|
|
||||||
## Open questions / explicit non-goals
|
|
||||||
|
|
||||||
- **Restore preview / dry-run.** Restic doesn't have a dry-run for restore.
|
|
||||||
Out of scope.
|
|
||||||
- **Resumable restore.** Restic restore is idempotent per-file but not
|
|
||||||
resumable mid-stream from where it left off. If a restore is cancelled,
|
|
||||||
the operator re-runs (files already written are overwritten). No state
|
|
||||||
to track.
|
|
||||||
- **Restore to a glob/pattern (e.g. `*.conf`).** Out of scope; the tree
|
|
||||||
picker requires explicit ticks. Power users can edit the URL or use the
|
|
||||||
CLI.
|
|
||||||
- **Bandwidth caps for restore.** Honoured automatically — restic's
|
|
||||||
`--limit-download` is part of `restic.Env` already (P2R-13) and applies
|
|
||||||
to restore unchanged.
|
|
||||||
- **Pre/post hooks for restore.** Hooks today gate only `kind=backup`
|
|
||||||
(P2R-11). Out of scope.
|
|
||||||
@@ -1,340 +0,0 @@
|
|||||||
# P4-03 / P4-04 — RBAC + User Management Design
|
|
||||||
|
|
||||||
> **Date:** 2026-05-05
|
|
||||||
> **Status:** brainstorm complete; ready for plan
|
|
||||||
> **Closes:** P4-03 (RBAC enforcement at API layer), P4-04 (User management UI)
|
|
||||||
|
|
||||||
## Goal
|
|
||||||
|
|
||||||
Enforce role-based access control at the HTTP layer (currently every authenticated user has admin powers) and ship the operator-facing screens for managing users, roles, and password lifecycle.
|
|
||||||
|
|
||||||
## Architecture
|
|
||||||
|
|
||||||
Two coupled subsystems landing in one PR:
|
|
||||||
|
|
||||||
1. **RBAC enforcement** — chi route-group middleware that gates each subtree by minimum role. Fail-closed default (admin) so a forgotten declaration doesn't accidentally widen access.
|
|
||||||
2. **User management** — `/settings/users` sub-tab with list / add / edit / disable. Setup-link flow for new users (1-hour-expiry single-use token). Self-service password change at `/settings/account`.
|
|
||||||
|
|
||||||
The audit log already records actor + user_id on every mutation; new endpoints fold in naturally.
|
|
||||||
|
|
||||||
## Role taxonomy
|
|
||||||
|
|
||||||
Locked. Three roles, hierarchical (admin ⊇ operator ⊇ viewer):
|
|
||||||
|
|
||||||
| Action | admin | operator | viewer |
|
|
||||||
|---|:-:|:-:|:-:|
|
|
||||||
| View dashboard / alerts / audit / hosts | ✓ | ✓ | ✓ |
|
|
||||||
| Trigger Run-now / Restore / Snapshot diff | ✓ | ✓ | ✗ |
|
|
||||||
| Acknowledge / resolve alerts | ✓ | ✓ | ✗ |
|
|
||||||
| Edit schedules / source groups / retention / hooks | ✓ | ✓ | ✗ |
|
|
||||||
| Add / remove hosts (enrolment, accept/reject pending) | ✓ | ✓ | ✗ |
|
|
||||||
| Cancel running jobs | ✓ | ✓ | ✗ |
|
|
||||||
| Edit repo credentials | ✓ | ✓ | ✗ |
|
|
||||||
| Edit notification channels | ✓ | ✗ | ✗ |
|
|
||||||
| Manage users | ✓ | ✗ | ✗ |
|
|
||||||
| Self password change (`/settings/account`) | ✓ | ✓ | ✓ |
|
|
||||||
|
|
||||||
The role enum already exists in the schema (`CHECK (role IN ('admin','operator','viewer'))`) and in `internal/store/types.go`. Bootstrap creates the first user as admin. Zero migration needed for existing installs.
|
|
||||||
|
|
||||||
## Schema changes
|
|
||||||
|
|
||||||
All column-level ALTERs (CLAUDE.md prefers these over rebuilds; safe under `foreign_keys=ON`).
|
|
||||||
|
|
||||||
### Migration 0017 — `users` extensions
|
|
||||||
|
|
||||||
```sql
|
|
||||||
ALTER TABLE users ADD COLUMN email TEXT;
|
|
||||||
ALTER TABLE users ADD COLUMN disabled_at TEXT;
|
|
||||||
ALTER TABLE users ADD COLUMN must_change_password INTEGER NOT NULL DEFAULT 0;
|
|
||||||
|
|
||||||
-- Username case-insensitive lookup. Existing rows are kept as-is;
|
|
||||||
-- normalisation only applies to new INSERTs (handled in Go).
|
|
||||||
CREATE UNIQUE INDEX users_username_lower ON users(LOWER(username));
|
|
||||||
```
|
|
||||||
|
|
||||||
### Migration 0018 — `user_setup_tokens`
|
|
||||||
|
|
||||||
```sql
|
|
||||||
CREATE TABLE user_setup_tokens (
|
|
||||||
user_id TEXT PRIMARY KEY REFERENCES users(id) ON DELETE CASCADE,
|
|
||||||
token_hash TEXT NOT NULL, -- sha256(raw_token), hex
|
|
||||||
expires_at TEXT NOT NULL,
|
|
||||||
created_at TEXT NOT NULL,
|
|
||||||
created_by TEXT REFERENCES users(id) ON DELETE SET NULL -- nullable so SET NULL can apply
|
|
||||||
);
|
|
||||||
|
|
||||||
CREATE INDEX user_setup_tokens_expires ON user_setup_tokens(expires_at);
|
|
||||||
```
|
|
||||||
|
|
||||||
`user_id` is PRIMARY KEY, not just FOREIGN KEY — only one outstanding setup token per user. Regenerating supersedes the old one via `INSERT OR REPLACE`.
|
|
||||||
|
|
||||||
## RBAC enforcement
|
|
||||||
|
|
||||||
### Middleware
|
|
||||||
|
|
||||||
```go
|
|
||||||
// requireRole returns chi middleware that 403s any request whose
|
|
||||||
// session-resolved user doesn't meet the minimum role. Roles are
|
|
||||||
// hierarchical: admin > operator > viewer.
|
|
||||||
func (s *Server) requireRole(min store.Role) func(http.Handler) http.Handler
|
|
||||||
```
|
|
||||||
|
|
||||||
Hierarchy implemented as a small helper:
|
|
||||||
|
|
||||||
```go
|
|
||||||
func roleAtLeast(have, min store.Role) bool {
|
|
||||||
rank := map[store.Role]int{
|
|
||||||
store.RoleViewer: 1,
|
|
||||||
store.RoleOperator: 2,
|
|
||||||
store.RoleAdmin: 3,
|
|
||||||
}
|
|
||||||
return rank[have] >= rank[min]
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
### Route grouping in `server.go`
|
|
||||||
|
|
||||||
The existing `/api` and UI routes get re-grouped into three role bands plus a self-service group:
|
|
||||||
|
|
||||||
```
|
|
||||||
/api/* viewer-readable — GET endpoints anyone authenticated can hit
|
|
||||||
/api/* operator+ — mutating endpoints up to host/source-group/schedule level
|
|
||||||
/api/* admin-only — /api/users/*, channel CRUD
|
|
||||||
/api/account — self-service password change
|
|
||||||
|
|
||||||
/audit, /alerts, /hosts/{id}, etc. — viewer
|
|
||||||
/hosts/{id}/run, /alerts/{id}/ack — operator
|
|
||||||
/settings/users/*, /settings/notifications/* — admin
|
|
||||||
/settings/account — viewer (any authenticated)
|
|
||||||
```
|
|
||||||
|
|
||||||
Default at the bottom of `routes()` is admin (fail-closed). Any future endpoint that doesn't get explicitly placed lands in admin-only, surfacing the missing declaration as a permission error rather than a silent bypass.
|
|
||||||
|
|
||||||
### Per-handler nuance
|
|
||||||
|
|
||||||
No case currently needs a handler-level check on top of the route gate. The closest candidate is the user-edit / account split: `GET /settings/users/{id}/edit` is admin-only, while `POST /api/account/password` is open to any authenticated user (viewer and up). The split-by-route already covers this, so no per-handler overrides are expected in v1.
|
|
||||||
|
|
||||||
### Out of scope of role middleware
|
|
||||||
|
|
||||||
- `/ws/agent` and `/api/agents/*` — agent bearer-token auth, separate chain
|
|
||||||
- `/healthz` — unauthenticated
|
|
||||||
- `/login`, `/logout`, `/bootstrap` — public
|
|
||||||
|
|
||||||
### 403 handling
|
|
||||||
|
|
||||||
- JSON endpoints: `{"error":"forbidden","code":"insufficient_role"}` with HTTP 403
|
|
||||||
- HTML endpoints: render a small "You don't have permission" panel inside the chrome (so the user keeps their nav and can move away), HTTP 403
|
|
||||||
- **No audit row on 403** — too noisy with normal users hitting URLs they don't have access to
|
|
||||||
|
|
||||||
### Session re-validation
|
|
||||||
|
|
||||||
Sessions need to honour `disabled_at` and current role on every request, not just at login. The session-validation middleware reads the user row each request (single PK lookup, fast in SQLite). If `disabled_at IS NOT NULL`, the session is invalidated and the request 401s. This makes "disable user" and "force logout" effectively immediate.
|
|
||||||
|
|
||||||
Cost: one SELECT per authenticated request. SQLite handles this comfortably for the fleet sizes this codebase targets.
|
|
||||||
|
|
||||||
## Setup-token flow (replacing temp passwords)
|
|
||||||
|
|
||||||
### Add user
|
|
||||||
|
|
||||||
1. Admin clicks **+ Add user** on `/settings/users`
|
|
||||||
2. Form: username (required, lowercase-normalised), email (optional, validated), role (admin/operator/viewer)
|
|
||||||
3. Server:
|
|
||||||
- Validates username uniqueness (case-insensitive). On collision with a *disabled* user, return a 409 with `{"existing_user_id": "...", "disabled": true}` so the UI can pivot to a "re-enable existing user" prompt
|
|
||||||
- On collision with an enabled user: 409 with a plain "username taken" error
|
|
||||||
- Creates user row with `password_hash = ""`, `must_change_password = 1`, `disabled_at = NULL`
|
|
||||||
- Generates 32 random bytes, hex-encodes → raw token (64 chars). Stores `sha256(token)` hex in `user_setup_tokens`. `expires_at = now + 1h`
|
|
||||||
- Audit: `user.created`, payload `{"username": "...", "role": "...", "with_setup_token": true}`
|
|
||||||
4. Server returns the admin to a one-time setup-link page: `/settings/users/{id}/setup-link`
|
|
||||||
- Shows the URL `http(s)://<base>/setup?token=<raw>` with a Copy button
|
|
||||||
- Countdown timer (live JS) showing time-to-expiry
|
|
||||||
- Warning: "This is the only time you'll see this link. If you lose it, regenerate from the user edit page."
|
|
||||||
- "Done" button → `/settings/users`
|
|
||||||
|
|
||||||
The raw token is **never persisted** server-side. Lost tokens require regeneration.
|
|
||||||
|
|
||||||
### Setup landing page (public, no auth required)
|
|
||||||
|
|
||||||
1. User clicks the link, lands on `/setup?token=<raw>`
|
|
||||||
2. Server hashes the token, looks up `user_setup_tokens` row, validates `expires_at > now`
|
|
||||||
3. On invalid / expired: render an error page with a "Contact your administrator" message. Audit: `user.setup_token.expired` (no actor).
|
|
||||||
4. On valid: render a password-set form: `new password + confirm`. Submit:
|
|
||||||
- Validates password meets policy (min 12 chars, no other constraints in v1 — same as bootstrap path)
|
|
||||||
- Hashes via `auth.HashPassword` (existing helper)
|
|
||||||
- Updates `users.password_hash`, sets `must_change_password = 0`
|
|
||||||
- Deletes the `user_setup_tokens` row (single-use)
|
|
||||||
- Logs the user in via the existing session helper
|
|
||||||
- Audit: `user.setup_completed`, payload `{"user_id": "..."}`
|
|
||||||
- Redirect to `/`
|
|
||||||
|
|
||||||
### Regenerate setup link (admin)
|
|
||||||
|
|
||||||
`/settings/users/{id}/edit` shows a "Regenerate setup link" button when `must_change_password = 1`. Clicking it:
|
|
||||||
|
|
||||||
1. Generates a new token + hash, INSERT OR REPLACE on `user_setup_tokens`
|
|
||||||
2. Returns the admin to the same one-time link page as the add-user flow
|
|
||||||
3. Audit: `user.setup_token.regenerated`
|
|
||||||
|
|
||||||
### Cleanup
|
|
||||||
|
|
||||||
Expired tokens linger in the DB until cleaned. Add a cheap sweep on the existing maintenance ticker: `DELETE FROM user_setup_tokens WHERE expires_at < ?`. Runs at the same cadence as the alert engine tick (60s). No new ticker needed.
|
|
||||||
|
|
||||||
## Self-service password change
|
|
||||||
|
|
||||||
`/settings/account`
|
|
||||||
|
|
||||||
- Accessible to every authenticated user (any role)
|
|
||||||
- Form: `current password + new password + confirm`
|
|
||||||
- Server validates current password (re-uses login bcrypt comparison), updates hash, audits `user.password_changed`
|
|
||||||
- Special case: if `must_change_password = 1`, the current-password field is hidden / not required (covers the legacy "admin reset password" path if we ever add one — current setup-token path doesn't use this)
|
|
||||||
|
|
||||||
The bootstrap user's password change uses this same page (no special case for "first admin").
|
|
||||||
|
|
||||||
## User list / management UI
|
|
||||||
|
|
||||||
### `/settings/users` (admin-only)
|
|
||||||
|
|
||||||
```
|
|
||||||
Settings · Users [3]
|
|
||||||
─────────────────────────────────────────────────
|
|
||||||
[ + Add user ] [ ] Show disabled
|
|
||||||
|
|
||||||
USERNAME EMAIL ROLE LAST LOGIN STATUS
|
|
||||||
alice alice@example.com admin 2 mins ago enabled
|
|
||||||
bob — operator 3 days ago enabled
|
|
||||||
charlie c@example.com viewer never setup pending ← if has open setup token
|
|
||||||
diane d@example.com operator 1 month ago disabled ← only when "Show disabled"
|
|
||||||
|
|
||||||
Actions per row: Edit · (Re-enable | Disable)
|
|
||||||
```
|
|
||||||
|
|
||||||
- "setup pending" badge for users with `must_change_password=1` — clicking the row goes to edit, which surfaces the regenerate-link button prominently
|
|
||||||
- "Show disabled" is a checkbox querystring filter (`?show_disabled=1`)
|
|
||||||
- Sort columns: clickable like the audit log (username, role, last_login). Reuse the same pattern (server-side sort + URL builder + glyph)
|
|
||||||
|
|
||||||
### `/settings/users/new` (admin-only)
|
|
||||||
|
|
||||||
Single form: `username + email (optional) + role`. On submit, the admin either lands on the setup-link page (success) or is returned to the form with an inline "username exists, re-enable existing?" panel (collision with a disabled user) or a red error (collision with an enabled user).
|
|
||||||
|
|
||||||
### `/settings/users/{id}/edit` (admin-only)
|
|
||||||
|
|
||||||
- Display-only block: id, created_at, last_login_at, status
|
|
||||||
- **Editable**: email, role
|
|
||||||
- **Buttons**:
|
|
||||||
- "Regenerate setup link" — only when `must_change_password = 1`
|
|
||||||
- "Disable user" — flips `disabled_at`; rejected if last enabled admin (server-side check). Confirmation modal with typed name to confirm.
|
|
||||||
- "Re-enable user" — clears `disabled_at`. No confirmation.
|
|
||||||
- "Force logout" — separate from disable; just kills the session but keeps the user enabled. Useful for "I think Bob's session was hijacked" without locking him out.
|
|
||||||
- Cancel / Save buttons at the bottom
|
|
||||||
|
|
||||||
### `/settings/users/{id}/setup-link` (admin-only)
|
|
||||||
|
|
||||||
Renders the one-time link with copy button + countdown. Shown after add-user and after regenerate. Reload of this URL after the token is consumed: 410 Gone with a clear message.
|
|
||||||
|
|
||||||
### `/settings/account` (any authenticated)
|
|
||||||
|
|
||||||
Self-service password change. Form-only page; no nav under Settings since most users will only see this one Settings page in v1.
|
|
||||||
|
|
||||||
## API surface
|
|
||||||
|
|
||||||
```
|
|
||||||
GET /api/users admin — list (with ?show_disabled=1 filter)
|
|
||||||
POST /api/users admin — create user, returns user_id + setup_url
|
|
||||||
GET /api/users/{id} admin — read
|
|
||||||
PATCH /api/users/{id} admin — update email, role
|
|
||||||
POST /api/users/{id}/disable admin — set disabled_at; rejects last-admin
|
|
||||||
POST /api/users/{id}/enable admin — clear disabled_at
|
|
||||||
POST /api/users/{id}/regenerate-setup admin — new token, returns setup_url
|
|
||||||
POST /api/users/{id}/force-logout admin — kill all sessions for this user
|
|
||||||
|
|
||||||
POST /api/account/password any auth — self password change
|
|
||||||
GET /setup public — landing page (HTML form)
|
|
||||||
POST /setup public — submit new password
|
|
||||||
```
|
|
||||||
|
|
||||||
UI routes mirror the API but at `/settings/users/...`.
|
|
||||||
|
|
||||||
## Last-admin self-protection
|
|
||||||
|
|
||||||
Two operations that could lock everyone out are guarded:
|
|
||||||
|
|
||||||
- **Disable user**: rejected if the user is admin AND there are no other enabled admins
|
|
||||||
- **Demote admin to operator/viewer**: same check
|
|
||||||
|
|
||||||
Server-side enforcement (single SELECT on `COUNT(*) FROM users WHERE role='admin' AND disabled_at IS NULL`). UI hint: edit page disables the role dropdown's non-admin options + disable button when the user is the last admin, with a tooltip explaining why.
|
|
||||||
|
|
||||||
The bootstrap admin is just a regular admin row; this check covers it.
|
|
||||||
|
|
||||||
## Audit actions
|
|
||||||
|
|
||||||
New action strings introduced:
|
|
||||||
|
|
||||||
- `user.created`
|
|
||||||
- `user.updated` (email / role change)
|
|
||||||
- `user.disabled`
|
|
||||||
- `user.enabled`
|
|
||||||
- `user.password_changed`
|
|
||||||
- `user.setup_completed`
|
|
||||||
- `user.setup_token.regenerated`
|
|
||||||
- `user.setup_token.expired` (system-driven, on cleanup sweep)
|
|
||||||
- `user.force_logout`
|
|
||||||
|
|
||||||
All target_kind = `user`, target_id = the affected user's id. Existing payload conventions apply.
|
|
||||||
|
|
||||||
## Ordering / dependencies
|
|
||||||
|
|
||||||
Slices in approximate landing order (writing-plans will firm this up):
|
|
||||||
|
|
||||||
1. **A. Schema** — migrations 0017 + 0018, `Role` helper updates, store API extensions (email, disabled_at, must_change_password, setup_token CRUD, lowercase username constraints)
|
|
||||||
2. **B. RBAC middleware** — `requireRole` + `roleAtLeast`, route re-grouping in server.go, 403 rendering for HTML + JSON
|
|
||||||
3. **C. Session re-validation** — extend the existing session middleware to re-read user state per request, kick disabled users
|
|
||||||
4. **D. Setup-token flow** — `/setup` GET+POST, the one-time link page after add-user
|
|
||||||
5. **E. User CRUD API** — handlers + handlers' tests
|
|
||||||
6. **F. UI** — `/settings/users` list, add, edit, setup-link page, account page
|
|
||||||
7. **G. Sweep** — Playwright walk through the full lifecycle (add → setup link → user signs in → admin disables → user gets kicked → admin re-enables → user signs back in)
|
|
||||||
|
|
||||||
Each slice can land as its own commit on the branch. RBAC middleware (B) goes in *before* user CRUD so we don't ship an open `/api/users/*` even briefly.
|
|
||||||
|
|
||||||
## Test strategy
|
|
||||||
|
|
||||||
- **Store**: `Set/GetSetupToken`, `EnableUser`/`DisableUser`, last-admin guard, lowercase-username uniqueness, expired-token cleanup
|
|
||||||
- **HTTP middleware**: `roleAtLeast` truth table; viewer hitting an operator route returns 403; disabled user gets 401 mid-session
|
|
||||||
- **Setup flow integration**: create user → fetch setup URL → land on `/setup?token=...` → POST password → user can log in → token row gone
|
|
||||||
- **UI**: existing Playwright sweep pattern, screenshots into `_diag/p4-03-04-sweep/`
|
|
||||||
|
|
||||||
## Out of scope (deferred)
|
|
||||||
|
|
||||||
- **OIDC** (P4-05) — adds a parallel auth chain. This PR keeps the surface for it (role taxonomy, session middleware) but doesn't wire it.
|
|
||||||
- **Email-the-setup-link** — explicitly deferred. Easy follow-up because the SMTP channel client from P3-06 is already there.
|
|
||||||
- **Hard delete** — disable-only in v1; can add a typed-confirm "purge" later if it turns out to be needed.
|
|
||||||
- **Password complexity / rotation policy** — current minimum (12 chars) and no rotation; tighten later if/when policy demands.
|
|
||||||
- **Lockout on failed login** — a brute-force protection layer is its own task and orthogonal to RBAC.
|
|
||||||
- **Audit on 403** — not in v1; revisit if compliance asks for it.
|
|
||||||
|
|
||||||
## Risks / gotchas to watch
|
|
||||||
|
|
||||||
- **Existing tests** that assume "any logged-in user can hit any endpoint" will break. Audit the test fixtures: most use `loginAsAdmin`, which is fine; any tests currently exercising specific operator/viewer paths need explicit role assignment. (Quick grep suggests there aren't many — bootstrap-only.)
|
|
||||||
- **Bootstrap user normalisation** — the existing admin row's username is whatever it was set to at first run. The new lowercase-uniqueness index uses `LOWER(username)`, which makes the existing row implicitly lowercase-keyed for lookups. No data migration needed.
|
|
||||||
- **Session middleware re-read cost** — one SELECT per authenticated request. SQLite WAL handles this fine at expected fleet sizes; if it ever shows up on a profile we add a small in-memory cache keyed by session id with a 30s TTL.
|
|
||||||
- **403 vs 401 distinction** — make sure unauthenticated requests still get 401 (login redirect) and authenticated-but-insufficient get 403. The middleware should compose: auth-required first, role-required second.
|
|
||||||
|
|
||||||
## Acceptance
|
|
||||||
|
|
||||||
- [ ] An admin can add a user, copy the setup link, the new user can land on `/setup?token=...`, set a password, and reach `/`
|
|
||||||
- [ ] An expired token (>1h) on `/setup?token=...` shows the "contact your administrator" page
|
|
||||||
- [ ] Admin regenerates the link, old token is invalid, new token works
|
|
||||||
- [ ] Operator user can trigger Run-now but cannot reach `/settings/users` (403) and the Users tab in Settings is hidden in their nav
|
|
||||||
- [ ] Viewer user gets 403 on Run-now, 200 on dashboard / alerts / audit
|
|
||||||
- [ ] Admin disables a user mid-session — the user's next request is 401 and they're redirected to login
|
|
||||||
- [ ] Admin cannot disable themselves if they are the last enabled admin (server returns 409, UI button is greyed)
|
|
||||||
- [ ] Self-service password change at `/settings/account` works for every role
|
|
||||||
- [ ] All existing tests pass; new test suite covers role middleware, setup-token lifecycle, last-admin guard
|
|
||||||
|
|
||||||
## Self-review notes
|
|
||||||
|
|
||||||
- ✅ All sections concrete, no TBD / TODO
|
|
||||||
- ✅ Schema migrations are column-level (CLAUDE.md compliance)
|
|
||||||
- ✅ Audit action vocabulary listed in one place; no string typos to drift
|
|
||||||
- ✅ Out-of-scope list explicit so reviewers can challenge what we *aren't* doing
|
|
||||||
- ✅ Last-admin guard handled both server-side and UI-hinted
|
|
||||||
- ✅ Token storage hashes the secret server-side; raw is shown to admin once and never again
|
|
||||||
- ✅ Session re-validation cost noted with a fallback if it shows up on a profile
|
|
||||||
@@ -1,215 +0,0 @@
|
|||||||
# P4-05 — OIDC Login Design
|
|
||||||
|
|
||||||
> **Date:** 2026-05-05
|
|
||||||
> **Status:** brainstorm complete; ready for plan
|
|
||||||
> **Closes:** P4-05 (OIDC login)
|
|
||||||
|
|
||||||
## Goal
|
|
||||||
|
|
||||||
Wire OpenID Connect authentication as a sign-in path alongside the existing local-user system, so a deployment that already has an IdP (Authelia, Authentik, Keycloak, Okta, Auth0, etc.) can use it for restic-manager logins.
|
|
||||||
|
|
||||||
## Architecture
|
|
||||||
|
|
||||||
OIDC sits on top of the local-user system rather than replacing it. The first time a user signs in via OIDC the server **just-in-time provisions** a local user row marked `auth_source='oidc'`, with role derived from the IdP's configured role claim (`role_claim`, default `groups`). Subsequent sign-ins look up the same row by stable `oidc_subject` and refresh role + email from the latest claims. Once the row exists it behaves like any other local user — admin can disable it, force-logout, see it in audit logs, etc. — except password-login is rejected because there's no password.
|
|
||||||
|
|
||||||
The Authorization Code flow (with PKCE) is implemented against the discovered well-known config of a single configured issuer. Front-channel logout: clicking Sign out drops the local session + redirects the browser to the IdP's `end_session_endpoint` (when advertised). Back-channel logout deferred.
|
|
||||||
|
|
||||||
## Locked decisions
|
|
||||||
|
|
||||||
| Decision | Pick |
|
|
||||||
|---|---|
|
|
||||||
| User lifecycle | **B** — JIT-provision local rows on first OIDC login (`auth_source='oidc'`, `oidc_subject`) |
|
|
||||||
| Role mapping config | **A** — YAML/env, claim name configurable (default `groups`, matching Authelia / Keycloak / Authentik), default = deny on no-match |
|
|
||||||
| Username source | `preferred_username`, fallback to `email` |
|
|
||||||
| Username collision with existing local user | **Refuse** with clear remediation message |
|
|
||||||
| Provider config | **Single provider** — `providers:` array can come later |
|
|
||||||
| Login page layout | SSO button **above** password form; password form labelled "or sign in with a local account" |
|
|
||||||
| OIDC users + password login | **Disabled** — `auth_source='oidc'` rows have empty `password_hash`; password form rejects them |
|
|
||||||
| Logout shape | **Front-channel only** — drop session + redirect to `end_session_endpoint` when advertised |
|
|
||||||
| Role re-evaluation | **At login only** — claims read at the OIDC callback; admin can disable mid-session locally |
|
|
||||||
|
|
||||||
## Schema changes
|
|
||||||
|
|
||||||
Migration 0019 — `users` extensions for OIDC bookkeeping:
|
|
||||||
|
|
||||||
```sql
|
|
||||||
ALTER TABLE users ADD COLUMN auth_source TEXT NOT NULL DEFAULT 'local'
|
|
||||||
CHECK (auth_source IN ('local', 'oidc'));
|
|
||||||
ALTER TABLE users ADD COLUMN oidc_subject TEXT;
|
|
||||||
|
|
||||||
CREATE UNIQUE INDEX users_oidc_subject ON users(oidc_subject)
|
|
||||||
WHERE oidc_subject IS NOT NULL;
|
|
||||||
```
|
|
||||||
|
|
||||||
Both column-level ALTERs (CLAUDE.md preference). The unique partial index defends the JIT-lookup invariant (one row per IdP subject) without blocking multiple rows with NULL oidc_subject (the local users).
|
|
||||||
|
|
||||||
## Configuration
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
# server config — extend existing config struct
|
|
||||||
oidc:
|
|
||||||
issuer: https://auth.example.com # well-known config discovered from this
|
|
||||||
client_id: restic-manager
|
|
||||||
client_secret: ${RM_OIDC_CLIENT_SECRET} # or via _FILE
|
|
||||||
display_name: Authelia # button label "Sign in with <display_name>"; default "SSO"
|
|
||||||
scopes: [openid, profile, email, groups]
|
|
||||||
role_claim: groups # default if absent (matches Authelia / Keycloak / Authentik)
|
|
||||||
role_mapping:
|
|
||||||
rm-admins: admin
|
|
||||||
rm-operators: operator
|
|
||||||
rm-viewers: viewer
|
|
||||||
# Optional — auto-derived from BaseURL if absent.
|
|
||||||
redirect_url: https://rm.example.com/auth/oidc/callback
|
|
||||||
```
|
|
||||||
|
|
||||||
Env-var overrides: `RM_OIDC_ISSUER`, `RM_OIDC_CLIENT_ID`, `RM_OIDC_CLIENT_SECRET`, `RM_OIDC_CLIENT_SECRET_FILE`. Mapping is YAML-only (env doesn't fit a multi-key string→string map cleanly).
|
|
||||||
|
|
||||||
When `oidc.issuer` is empty or missing, OIDC is disabled (current behaviour). No restart-toggle UI; this is a deploy-time setting.
|
|
||||||
|
|
||||||
## Auth flow
|
|
||||||
|
|
||||||
### Login start
|
|
||||||
|
|
||||||
`GET /auth/oidc/login` — only mounted when OIDC is configured.
|
|
||||||
|
|
||||||
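1. Generate `state` (32 random bytes, base64url-encoded) and `code_verifier` (64 random bytes, base64url-encoded without padding — 86 characters, inside the 43–128 range PKCE requires); compute `code_challenge = base64url(sha256(code_verifier))`, also without padding.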
|
|
||||||
2. Store `(state, code_verifier, created_at)` in a new ephemeral table (or in memory with a 5-minute TTL — see "trade-off" below).
|
|
||||||
3. Redirect to `<authorization_endpoint>?response_type=code&client_id=...&redirect_uri=...&scope=...&state=...&code_challenge=...&code_challenge_method=S256`.
|
|
||||||
|
|
||||||
### Callback
|
|
||||||
|
|
||||||
`GET /auth/oidc/callback?code=...&state=...` — also OIDC-only mount.
|
|
||||||
|
|
||||||
1. Validate `state` against the stored value (one-shot — delete row on read). Reject if missing/expired/already used.
|
|
||||||
2. Exchange `code` + `code_verifier` for tokens at `token_endpoint`.
|
|
||||||
3. Validate the `id_token` JWT: signature against the JWKS endpoint, `iss`, `aud`, `exp`, `iat`, `nonce` (if used).
|
|
||||||
4. Extract `sub`, `preferred_username`, `email`, and the configured `role_claim` (default `groups`).
|
|
||||||
5. Pick username: `preferred_username` if non-empty, else `email`. Lowercase / trim per the existing local-user rules.
|
|
||||||
6. Pick role: first match in `role_mapping` against the array of role-claim values. **No match → deny with a clear error page**, no row created.
|
|
||||||
7. Look up user by `oidc_subject`. Three cases:
|
|
||||||
- **Found** — refresh `email`, `role`, `last_login_at`. Don't touch `username` (changing it would break audit trails; if the IdP changes the username, that's an operator concern). Log `user.oidc_login`.
|
|
||||||
- **Not found, username free** — INSERT row with `auth_source='oidc'`, `oidc_subject=<sub>`, `password_hash=''`, `must_change_password=0`. Log `user.created` with payload `{"auth_source":"oidc"}` + `user.oidc_login`.
|
|
||||||
- **Not found, username taken by a local user** — render an error page: "This OIDC user (`<sub>`) wants to sign in as `alice`, but a local user with that name already exists. Ask your administrator to either rename / remove the local user, or exclude this user from the OIDC mapping." 403, no row created. Log `user.oidc_login_blocked`.
|
|
||||||
8. Drop a session cookie + `MarkUserLogin` (the existing helper).
|
|
||||||
9. Redirect to `/`.
|
|
||||||
|
|
||||||
### Logout
|
|
||||||
|
|
||||||
`POST /logout` (existing handler) — augmented:
|
|
||||||
|
|
||||||
1. Look up the session before deletion (we need the user row to know if they're an OIDC user).
|
|
||||||
2. Delete the session as today.
|
|
||||||
3. If the user is `auth_source='oidc'` AND the discovered `end_session_endpoint` is non-empty → 303 to `<end_session_endpoint>?id_token_hint=<id_token>&post_logout_redirect_uri=<base>/login`. Otherwise → existing 303 to `/login`.
|
|
||||||
|
|
||||||
We need to keep the latest `id_token` per session to drive `id_token_hint`. Stash it in a new `sessions.id_token TEXT` column (one column-level ALTER on migration 0019 alongside the user columns), populated only for OIDC sessions.
|
|
||||||
|
|
||||||
## State table
|
|
||||||
|
|
||||||
Two reasonable shapes for the short-lived state used during the OAuth round-trip:
|
|
||||||
|
|
||||||
- **In-memory map** with a 5-minute TTL sweeper. Simpler, but multi-process deployments lose it (there is no multi-process deployment today, but Phase 5 OSS readiness might add one).
|
|
||||||
- **`oidc_state` table** — `(state_hash PK, code_verifier, created_at)`, swept on the same 60s alert-engine tick that already handles setup-token cleanup.
|
|
||||||
|
|
||||||
I'll go with the **table**. Costs ~3 lines in the existing cleanup tick, behaves correctly under restarts, and survives a future scale-out. Migration 0019 includes:
|
|
||||||
|
|
||||||
```sql
|
|
||||||
CREATE TABLE oidc_state (
|
|
||||||
state_hash TEXT PRIMARY KEY, -- sha256(state) hex; raw state never persisted
|
|
||||||
code_verifier TEXT NOT NULL,
|
|
||||||
created_at TEXT NOT NULL
|
|
||||||
);
|
|
||||||
CREATE INDEX oidc_state_created ON oidc_state(created_at);
|
|
||||||
```
|
|
||||||
|
|
||||||
## Login-page UI
|
|
||||||
|
|
||||||
`/login` template branches based on `view.OIDCEnabled`:
|
|
||||||
|
|
||||||
- **OIDC off** → current layout (just the password form).
|
|
||||||
- **OIDC on** → a `Sign in with <provider name>` button at the top, then a faint divider line, then the existing password form labelled "Or sign in with a local account". Provider name comes from a new optional config `oidc.display_name` (defaults to "SSO").
|
|
||||||
|
|
||||||
Failed-OIDC redirects (no role match, username collision, IdP error) land on `/login?oidc_error=<reason>` with a small banner above the buttons.
|
|
||||||
|
|
||||||
## Audit actions
|
|
||||||
|
|
||||||
New entries in the action vocabulary:
|
|
||||||
|
|
||||||
- `user.oidc_login` (target_kind=user, target_id=user_id, payload `{"sub":"…"}`)
|
|
||||||
- `user.oidc_login_blocked` (target_kind=user, target_id=oidc_subject when no row was created, payload `{"username":"…", "reason":"username_taken|no_role_match|other"}`)
|
|
||||||
- `user.created` already exists; OIDC's first-time provisioning fires this with payload `{"auth_source":"oidc"}` so the audit log distinguishes admin-created from JIT-provisioned rows.
|
|
||||||
|
|
||||||
## User-management UI changes
|
|
||||||
|
|
||||||
Small additions, not new screens:
|
|
||||||
|
|
||||||
- **Users list** — Status column adds a small `oidc` chip when `auth_source='oidc'` so admin can see at a glance which rows came from JIT-provisioning. Sortable by auth_source via the same sortable-headers pattern (lands as a small follow-up if anyone asks; out of scope for v1).
|
|
||||||
- **Add user form** — disabled when OIDC is the only auth path, with a hint: "User provisioning is handled by your OIDC provider; users appear here on first sign-in." Configurable later via a `oidc.disable_local_users` flag if that becomes a real ask. Out of scope for v1; both paths stay open.
|
|
||||||
- **Edit user form** — when `auth_source='oidc'`:
|
|
||||||
- Username field disabled (changing it would just be undone on next OIDC login)
|
|
||||||
- Role dropdown disabled, with a hint: "Role is managed by your OIDC provider's role-claim mapping. Edit the mapping in server config to change."
|
|
||||||
- Email field disabled (refreshed from IdP on each login)
|
|
||||||
- **Disable / Enable / Force logout** still work — disabling an OIDC user kicks their session and rejects future OIDC logins ("user disabled by administrator")
|
|
||||||
- **Regenerate setup link** hidden — there's no setup token for OIDC users
|
|
||||||
- **Login UI** — password form rejects users with `auth_source='oidc'` ("This account uses single sign-on. Click the SSO button above.")
|
|
||||||
|
|
||||||
## Middleware / handler changes
|
|
||||||
|
|
||||||
- **Routes**: new public-band entries `GET /auth/oidc/login`, `GET /auth/oidc/callback`. Skipped entirely when OIDC isn't configured (`s.deps.OIDC == nil`).
|
|
||||||
- **Logout handler** augmented to fetch the user row + decide between local logout (303 → `/login`) and OIDC logout (303 → `end_session_endpoint`).
|
|
||||||
- **Login handler** rejects `auth_source='oidc'` users with the SSO-prompt error.
|
|
||||||
- **Last-admin guard** — already covers OIDC users naturally because they live in the `users` table. The role-from-claims path could create a "every admin gets demoted to operator" situation if the IdP's claim mapping is wrong; the guard rejects that demotion at the moment it'd be applied (returns the user to the login page with `oidc_error=role_change_blocked` and audit entry; admin must fix the mapping or promote a local admin first).
|
|
||||||
|
|
||||||
## Implementation outline
|
|
||||||
|
|
||||||
1. **Schema** — migration 0019 (users.auth_source + oidc_subject, sessions.id_token, oidc_state table)
|
|
||||||
2. **Config** — extend `internal/server/config` with the OIDC block + env-var overrides; load JWKS lazily
|
|
||||||
3. **Discovery + JWKS** — small helper that fetches `<issuer>/.well-known/openid-configuration` once at startup, caches `authorization_endpoint`, `token_endpoint`, `end_session_endpoint`, `jwks_uri`. JWKS refreshed on first failed verification.
|
|
||||||
4. **Login start handler** — `/auth/oidc/login`
|
|
||||||
5. **Callback handler** — `/auth/oidc/callback`, with the four claim-resolution branches
|
|
||||||
6. **Logout handler augmentation** — branch on `auth_source`
|
|
||||||
7. **Login form rejection** — local-user password form rejects OIDC accounts
|
|
||||||
8. **State cleanup** — extend the alert engine's existing cleanup tick
|
|
||||||
9. **UI** — `oidc` chip on users list, disabled fields on edit-form for OIDC users, login page SSO button + error banner
|
|
||||||
10. **Tests** — config parse tests; happy-path callback test using a fake IdP (httptest server with a hand-rolled discovery doc + JWKS); username-collision test; no-role-match test; logout test
|
|
||||||
11. **Sweep** — full Playwright walk against an actual IdP (Authelia in a Docker container) — admin gets in via OIDC, role mapping works, logout redirects through IdP, OIDC user can't password-login
|
|
||||||
|
|
||||||
## Test strategy
|
|
||||||
|
|
||||||
The IdP is the hard part to test cleanly. Two layers:
|
|
||||||
|
|
||||||
- **Unit / integration tests** use a stub OIDC provider built into the test harness — `httptest.Server` exposing `.well-known/openid-configuration`, a token endpoint that signs minted JWTs with a test ECDSA key, and a JWKS endpoint serving the public key. This covers every code path without a real IdP. Pattern: each test mints its own claims and runs the callback against the stub.
|
|
||||||
- **Smoke env** runs against a real Authelia container (existing `compose.smoke.yaml`-style file or one-liner `docker run`) for the final sweep — confirms the discovery doc isn't being misread, real JWT verification works, real `end_session_endpoint` redirect works.
|
|
||||||
|
|
||||||
## Out of scope (deferred)
|
|
||||||
|
|
||||||
- **Multi-provider** support (`providers:` array)
|
|
||||||
- **Back-channel logout** (OpenID Connect Back-Channel Logout 1.0) — schema isn't blocked from adding it later
|
|
||||||
- **UI-driven role mapping** (config-only in v1)
|
|
||||||
- **Refresh tokens / mid-session role re-evaluation** — login-only refresh in v1
|
|
||||||
- **`oidc.disable_local_users`** flag — both paths stay open in v1
|
|
||||||
- **OIDC user dashboard chip / badges** beyond the small `oidc` indicator on the users list
|
|
||||||
- **Per-user "auth source" filter on the users list** — sortable headers cover most of the use case
|
|
||||||
|
|
||||||
## Risks / gotchas
|
|
||||||
|
|
||||||
- **JWKS key rotation** — refresh on first failed verification is the standard fix; document the cache TTL (1h) in the config block.
|
|
||||||
- **Clock skew** — accept `iat`/`exp` with a 60s leeway; matches what most OIDC libraries do.
|
|
||||||
- **End-session 404 / not advertised** — degrade gracefully; just drop the session and 303 to `/login`. Don't 500 the logout because the IdP doesn't implement RP-initiated logout.
|
|
||||||
- **Username changes at the IdP** — silently keep the local username (matches our locked decision: subject is the stable key, username is display-only). Document.
|
|
||||||
- **Role claim is sometimes a string, sometimes an array, sometimes a comma-separated string** depending on IdP — normalise into `[]string` before mapping. Authelia/Keycloak emit arrays; some custom setups emit strings; handle both.
|
|
||||||
- **Authelia `sub` is an opaque UUID, not the username** (Authelia 4.39+ default for new clients). Don't assume `sub` is human-readable; it's stable but display value is `preferred_username` or `email`. The locked design already keys lookups on `sub` and uses `preferred_username` for the display username, so this is just a correctness note.
|
|
||||||
- **`end_session_endpoint` may not be published** (Authelia doesn't advertise it for many configs). The locked logout flow already degrades to "drop session + redirect to /login" when the discovery doc lacks it; no extra config needed.
|
|
||||||
- **Password-form bypass for OIDC users via /api/auth/login (JSON)** — same rejection rule applies, not just the HTML form.
|
|
||||||
|
|
||||||
## Acceptance
|
|
||||||
|
|
||||||
- [ ] An OIDC user with `groups: ["rm-admins"]` (the default `role_claim`) can sign in, becomes an admin, is visible in `/settings/users` with an `oidc` chip
|
|
||||||
- [ ] Same user signing in again resolves to the same row (no duplicate)
|
|
||||||
- [ ] Same user with `groups: ["something-else"]` (no entry in `role_mapping`) is denied, lands on `/login?oidc_error=no_role_match` with a banner, no row created
|
|
||||||
- [ ] OIDC user can't password-login through `/login` or `/api/auth/login`
|
|
||||||
- [ ] Admin disables an OIDC user → next OIDC login is rejected, existing session bounced (existing disable-mid-session)
|
|
||||||
- [ ] Sign out as an OIDC user → 303 to IdP's end-session URL (when advertised); no end-session URL → 303 to `/login`
|
|
||||||
- [ ] OIDC config absent → password login works exactly as today (zero behavioural change)
|
|
||||||
- [ ] Username collision: a local `alice` exists, OIDC user with `preferred_username=alice` and a different `sub` → blocked at sign-in with the clear error page
|
|
||||||
- [ ] Last-admin guard refuses to demote the only enabled admin even if the IdP's role mapping says otherwise
|
|
||||||
- [ ] All existing tests pass; new test suite covers the four claim-resolution branches and logout
|
|
||||||
@@ -1,229 +0,0 @@
|
|||||||
# P5-03 — Docker-only release path
|
|
||||||
|
|
||||||
**Status:** approved 2026-05-05. Pivots P5-03 away from `goreleaser` +
|
|
||||||
binary archives toward a single Docker image as the only public
|
|
||||||
deliverable.
|
|
||||||
|
|
||||||
## Goal
|
|
||||||
|
|
||||||
One artifact per tag: the `restic-manager` server image, multi-arch
|
|
||||||
(linux amd64 + arm64), published to the Gitea container registry of
|
|
||||||
this self-hosted instance. The image bakes in cross-compiled agent
|
|
||||||
binaries (linux amd64, linux arm64, windows amd64), the install
|
|
||||||
scripts, and the systemd unit at a read-only image path. The running
|
|
||||||
server distributes those agents and scripts via its existing
|
|
||||||
`/agent/binary` and `/install/*` endpoints; operators on N hosts never
|
|
||||||
download a release artifact directly.
|
|
||||||
|
|
||||||
Source builds via `make build` remain a first-class path for anyone
|
|
||||||
who wants binaries.
|
|
||||||
|
|
||||||
## Non-goals
|
|
||||||
|
|
||||||
- Standalone binary archives (`.tar.gz`, `.zip`) on the release page.
|
|
||||||
- darwin / windows-arm64 agent targets — neither is service-tested.
|
|
||||||
- `goreleaser`. Not used.
|
|
||||||
- `cosign`, `SBOM`, `in-toto`, `minisign`. Re-promote when we ship
|
|
||||||
binaries outside an image (Phase 6 candidate).
|
|
||||||
- GHCR / GitHub mirror. Single source of truth = Gitea.
|
|
||||||
|
|
||||||
## Decisions captured (with one-line rationale)
|
|
||||||
|
|
||||||
| ID | Decision | Why |
|
|
||||||
|----|----------|-----|
|
|
||||||
| D1 | One artifact: server Docker image | Architecture already routes agent distribution through the server (`/agent/binary`); release surface should mirror that. |
|
|
||||||
| D2 | Trigger: `tag-push` (`v*.*.*`) **plus** `workflow_dispatch` | Tag for real cuts; dispatch for snapshot iteration without polluting tag history. |
|
|
||||||
| D3 | Build matrix: linux amd64+arm64 server image; agent cross-compiles for linux amd64+arm64+windows amd64 | Mirrors the existing CI build matrix; nothing ships that hasn't been service-tested. |
|
|
||||||
| D4 | Image-baked, separate path (`/opt/restic-manager/dist/`); HTTP handler reads `<DataDir>/...` first, falls back to `/opt/...` | Volume stays purely operator state; image content is immutable per tag; eliminates the smoke-env "stale agent" footgun in production. |
|
|
||||||
| D5 | Tag fan-out: `vX.Y.Z`, `X.Y`, `X`, `latest` — but `latest` is held back until `v1.0.0` | Standard rolling-minor pattern; pre-1.0 forces explicit pinning. |
|
|
||||||
| D6 | Snapshot tag: `:snapshot-<shortsha>`, never moves `latest` | Operator can never accidentally pull an unblessed build. |
|
|
||||||
| D7 | Version embedding via `-ldflags`: `main.version`, `main.commit`, `main.date` on both `cmd/server` and `cmd/agent` | Server already had `version`; add `commit`/`date` to both for parity and traceability. |
|
|
||||||
| D8 | Registry: Gitea container registry on this instance, under `<host>/<owner>/restic-manager` | One source of truth, no external creds. |
|
|
||||||
| D9 | Integrity: a `SHA256SUMS` file + the manifest digest in the release notes; nothing else | Image is the unit of trust; pull-by-digest is the verification primitive. |
|
|
||||||
| D10 | P1-31 (signed binaries) stays deferred | Re-promote the day we ship binaries outside an image. |
|
|
||||||
|
|
||||||
## Image layout
|
|
||||||
|
|
||||||
Multi-stage Dockerfile (extends today's `deploy/Dockerfile.server`):
|
|
||||||
|
|
||||||
```
|
|
||||||
build stage (golang:1.25-alpine):
|
|
||||||
cross-compile cmd/server for $TARGETARCH (linux)
|
|
||||||
cross-compile cmd/agent for linux/amd64
|
|
||||||
cross-compile cmd/agent for linux/arm64
|
|
||||||
cross-compile cmd/agent for windows/amd64
|
|
||||||
(CGO_ENABLED=0 throughout — pure-Go SQLite)
|
|
||||||
|
|
||||||
final stage (gcr.io/distroless/static-debian12:nonroot):
|
|
||||||
/usr/local/bin/restic-manager-server (matches image arch)
|
|
||||||
/opt/restic-manager/dist/agent-binaries/
|
|
||||||
restic-manager-agent-linux-amd64
|
|
||||||
restic-manager-agent-linux-arm64
|
|
||||||
restic-manager-agent-windows-amd64.exe
|
|
||||||
/opt/restic-manager/dist/install/
|
|
||||||
install.sh
|
|
||||||
install.ps1
|
|
||||||
restic-manager-agent.service
|
|
||||||
```
|
|
||||||
|
|
||||||
`/opt/restic-manager/dist/` is owned by `root:root`, mode `0755` for
|
|
||||||
directories, `0755` for `install.sh` (script must be executable when
|
|
||||||
the install path uses `curl ... | sh` semantics) and `0644` for the
|
|
||||||
unit file and `install.ps1`. The agent binaries are mode `0755`.
|
|
||||||
|
|
||||||
`<DataDir>` keeps holding only operator state: `restic-manager.db`,
|
|
||||||
`secret.key`, `secrets.enc`, `audit/`, `tls/`. Nothing the image
|
|
||||||
owns gets written into the volume.
|
|
||||||
|
|
||||||
## Server-side handler change
|
|
||||||
|
|
||||||
`internal/server/http/agent_assets.go` today reads from
|
|
||||||
`<DataDir>/agent-binaries/<name>` and `<DataDir>/install/<name>`.
|
|
||||||
|
|
||||||
Change: if the file isn't present in `<DataDir>`, fall back to
|
|
||||||
`/opt/restic-manager/dist/<subpath>/<name>`. The fallback path is a
|
|
||||||
new server-config field defaulted to `/opt/restic-manager/dist`,
|
|
||||||
overridable via `RM_BUNDLED_ASSETS_DIR` for tests and source-build
|
|
||||||
deployments. If neither path resolves, return 404 (existing
|
|
||||||
`binary_not_published` / `not_found` body unchanged).
|
|
||||||
|
|
||||||
This means:
|
|
||||||
- A fresh container without any operator-staged overrides serves the
|
|
||||||
baked-in agents. No first-run setup needed.
|
|
||||||
- An operator can still drop a custom-built agent into
|
|
||||||
`<DataDir>/agent-binaries/` to override the image's copy (handy for
|
|
||||||
pre-release agent testing without rebuilding the server image).
|
|
||||||
- Source-build dev (`bin/restic-manager-server` running out of the
|
|
||||||
working tree) still works exactly as today — the fallback dir is
|
|
||||||
configurable, and the `<DataDir>` path remains the primary lookup.
|
|
||||||
|
|
||||||
Tests cover four cases: (a) DataDir hit, (b) fallback hit, (c) DataDir
|
|
||||||
hit shadows fallback, (d) neither — 404.
|
|
||||||
|
|
||||||
## Versioning
|
|
||||||
|
|
||||||
Both binaries grow `commit` and `date` ldflag-targets next to the
|
|
||||||
existing `version`:
|
|
||||||
|
|
||||||
```go
|
|
||||||
var (
|
|
||||||
version = "dev"
|
|
||||||
commit = "none"
|
|
||||||
date = "unknown"
|
|
||||||
)
|
|
||||||
```
|
|
||||||
|
|
||||||
Dockerfile gains `ARG VERSION`, `ARG COMMIT`, `ARG DATE`, all
|
|
||||||
`""`-defaulted; the `go build` line passes them via `-ldflags`. The
|
|
||||||
release workflow fills them from `${{ gitea.ref_name }}`,
|
|
||||||
`${{ gitea.sha }}`, and a UTC ISO-8601 timestamp.
|
|
||||||
|
|
||||||
Snapshot builds (workflow_dispatch) compute
|
|
||||||
`VERSION=0.0.0-snapshot-${SHORTSHA}` and tag the image as
|
|
||||||
`:snapshot-${SHORTSHA}` only. They never touch `latest` or any
|
|
||||||
`vX.Y.Z` tag.
|
|
||||||
|
|
||||||
## Workflow (`.gitea/workflows/release.yml`)
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
name: Release
|
|
||||||
|
|
||||||
on:
|
|
||||||
push:
|
|
||||||
tags: ['v[0-9]+.[0-9]+.[0-9]+']
|
|
||||||
workflow_dispatch:
|
|
||||||
|
|
||||||
env:
|
|
||||||
IMAGE: gitea.dcglab.co.uk/${{ gitea.repository }}
|
|
||||||
|
|
||||||
jobs:
|
|
||||||
image:
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
steps:
|
|
||||||
- uses: actions/checkout@v4
|
|
||||||
- uses: docker/setup-qemu-action@v3
|
|
||||||
- uses: docker/setup-buildx-action@v3
|
|
||||||
- uses: docker/login-action@v3
|
|
||||||
with:
|
|
||||||
registry: gitea.dcglab.co.uk
|
|
||||||
username: ${{ gitea.actor }}
|
|
||||||
password: ${{ secrets.GITHUB_TOKEN }}
|
|
||||||
- name: compute tags
|
|
||||||
id: meta
|
|
||||||
run: |
|
|
||||||
# tag-push → :vX.Y.Z, :X.Y, :X (only :latest if X >= 1)
|
|
||||||
# dispatch → :snapshot-<shortsha>
|
|
||||||
...
|
|
||||||
- uses: docker/build-push-action@v6
|
|
||||||
with:
|
|
||||||
context: .
|
|
||||||
file: deploy/Dockerfile.server
|
|
||||||
platforms: linux/amd64,linux/arm64
|
|
||||||
push: true
|
|
||||||
tags: ${{ steps.meta.outputs.tags }}
|
|
||||||
build-args: |
|
|
||||||
VERSION=${{ steps.meta.outputs.version }}
|
|
||||||
COMMIT=${{ gitea.sha }}
|
|
||||||
DATE=${{ steps.meta.outputs.date }}
|
|
||||||
```
|
|
||||||
|
|
||||||
The `compute tags` step:
|
|
||||||
|
|
||||||
- For `push:tags`: extract `vMAJOR.MINOR.PATCH`. Always emit
|
|
||||||
`:vMAJOR.MINOR.PATCH`, `:MAJOR.MINOR`, `:MAJOR`. Emit `:latest`
|
|
||||||
only when `MAJOR >= 1`.
|
|
||||||
- For `workflow_dispatch`: emit `:snapshot-<shortsha>`. Nothing else.
|
|
||||||
|
|
||||||
No release-asset upload step yet — the GHCR-equivalent registry push
|
|
||||||
is the deliverable. A future iteration may attach a `SHA256SUMS` file
|
|
||||||
to a Gitea release object once `tea release create` is wired in;
|
|
||||||
that's not in scope for the first cut.
|
|
||||||
|
|
||||||
## Tests / verification
|
|
||||||
|
|
||||||
1. `go vet ./...` (CLAUDE.md rule, runs locally pre-commit).
|
|
||||||
2. `go test ./internal/server/http/...` covers the new fallback
|
|
||||||
logic.
|
|
||||||
3. Local manual smoke: `docker build -f deploy/Dockerfile.server .`
|
|
||||||
produces an image; `docker run --rm -p 8080:8080 <image>` starts the server;
|
|
||||||
`curl 'http://127.0.0.1:8080/agent/binary?os=linux&arch=amd64'`
|
|
||||||
serves bytes; `curl http://127.0.0.1:8080/install/install.sh`
|
|
||||||
serves the script.
|
|
||||||
4. Release workflow itself is exercised on first tag-push; until
|
|
||||||
then, `workflow_dispatch` is the smoke test.
|
|
||||||
|
|
||||||
## Operator-facing changes
|
|
||||||
|
|
||||||
- `README.md` install snippet becomes
|
|
||||||
`docker run -v rm-data:/var/lib/restic-manager ...
|
|
||||||
gitea.dcglab.co.uk/<owner>/restic-manager:vX.Y.Z`. Pre-1.0
|
|
||||||
releases are pinned by exact tag; no `:latest` is published.
|
|
||||||
- The CLAUDE.md "restage" block is dev-only (smoke env runs the
|
|
||||||
server out of `bin/`). Production users on the image never see
|
|
||||||
it.
|
|
||||||
- `RM_BUNDLED_ASSETS_DIR` is documented in the server config
|
|
||||||
reference (defaults to `/opt/restic-manager/dist`).
|
|
||||||
|
|
||||||
## Risks / footguns
|
|
||||||
|
|
||||||
- **Image size growth.** Three agent binaries (~15-20 MB each
|
|
||||||
stripped) add ~50 MB. Acceptable; we're already shipping a
|
|
||||||
distroless server. Watch the trajectory once Phase 4 alerting is
|
|
||||||
in.
|
|
||||||
- **Dockerfile cross-compile multiplies build time** on the runner.
|
|
||||||
Pure-Go means each leg is just a `go build`; total stage time
|
|
||||||
should stay under 60s on the self-hosted runner.
|
|
||||||
- **`ARG VERSION` leakage.** The current Dockerfile already accepts
|
|
||||||
`ARG VERSION=dev`; we're tightening, not loosening.
|
|
||||||
- **Operator overriding `<DataDir>/agent-binaries/<name>`** with a
|
|
||||||
stale binary will silently shadow the image's copy. Documented in
|
|
||||||
the server config reference; this is a feature (lets operators
|
|
||||||
hot-patch a pre-release agent) not a bug.
|
|
||||||
|
|
||||||
## Out of scope (tracked for follow-up)
|
|
||||||
|
|
||||||
- Cosign / SBOM / in-toto provenance — defer to Phase 6 with the rest
|
|
||||||
of the supply-chain hardening.
|
|
||||||
- GHCR mirror — defer until P5-01 docs site goes public.
|
|
||||||
- `tea release create` integration — pending until we have something
|
|
||||||
worth attaching beyond the image digest.
|
|
||||||
@@ -1,448 +0,0 @@
|
|||||||
# P6-01 + P6-02 — Agent self-update + fleet update
|
|
||||||
|
|
||||||
Status: design approved 2026-05-06.
|
|
||||||
Scope: P6-01 (agent self-update mechanism) and P6-02 (dashboard
|
|
||||||
version reporting + fleet update UI). One spec, one branch — the
|
|
||||||
two tasks are tightly coupled (P6-02 is the operator surface for
|
|
||||||
the mechanism P6-01 ships).
|
|
||||||
|
|
||||||
## 1. Background
|
|
||||||
|
|
||||||
P5-03 pivoted release distribution to a single multi-arch server
|
|
||||||
Docker image, with cross-compiled agent binaries baked under
|
|
||||||
`/opt/restic-manager/dist/agent-binaries/` and served via
|
|
||||||
`GET /agent/binary?os=…&arch=…`. The plumbing already does
|
|
||||||
dual-path lookup: `<DataDir>/agent-binaries/<name>` overrides the
|
|
||||||
image-baked copy, so an operator can hot-patch a pre-release agent
|
|
||||||
without rebuilding the image.
|
|
||||||
|
|
||||||
That makes the server the natural distribution point for agent
|
|
||||||
upgrades. "Update agent" collapses to "re-fetch from your own
|
|
||||||
server" — no apt repo, no Chocolatey, no third-party signing infra,
|
|
||||||
and version pinning is automatic because the server only ever
|
|
||||||
serves the agent that matches its own release.
|
|
||||||
|
|
||||||
This spec wires up the update mechanism end-to-end and the
|
|
||||||
operator surface that drives it.
|
|
||||||
|
|
||||||
## 2. Decisions
|
|
||||||
|
|
||||||
| # | Decision | Rationale |
|
|
||||||
|---|----------|-----------|
|
|
||||||
| 1 | Operator-driven only — no auto-update | Matches the rest of the app's job-dispatch model; avoids "bad release upgrades every host instantly"; auto-update can be added later as a setting flip if asked |
|
|
||||||
| 2 | Linux: just exit, let systemd restart. Windows: detached helper script. | Linux supports rename-while-open; Windows holds an exclusive lock on the running .exe |
|
|
||||||
| 3 | M1 (keep `agent.old` on disk) + M2 (rolling fleet update with halt-on-fail). Skip M3 (auto-rollback watchdog). | M1 is ~5 lines, M2 falls naturally out of P6-02's UI, M3 is a lot of plumbing for "shipped a binary that doesn't start" |
|
|
||||||
| 4 | Skip sha256 digest verification for v1 | TLS already covers the corruption-in-transit threat; image-tampering is image-build's problem, not the agent's |
|
|
||||||
| 5 | Exact string version match for "out of date" | With server-bundled binaries there's exactly one canonical version per server image — anything else is out of date by definition |
|
|
||||||
| 6 | WS envelope only, no `restic-manager-agent update` CLI subcommand | YAGNI; no concrete consumer; the underlying logic is reusable when one appears |
|
|
||||||
|
|
||||||
## 3. Wire protocol
|
|
||||||
|
|
||||||
### 3.1 Server → agent: `command.update`
|
|
||||||
|
|
||||||
```
|
|
||||||
{
|
|
||||||
"type": "command.update",
|
|
||||||
"id": "<envelope id>",
|
|
||||||
"payload": {
|
|
||||||
"job_id": "<ulid>"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
No `os` / `arch` / `version` in the payload — the agent already
|
|
||||||
knows its own build target and fetches from its configured server
|
|
||||||
URL via the existing `/agent/binary` handler. Including a target
|
|
||||||
version would also tempt the agent into version-comparison logic;
|
|
||||||
keep that on the server side.
|
|
||||||
|
|
||||||
### 3.2 Job lifecycle (server-driven)
|
|
||||||
|
|
||||||
The agent has limited ability to report on its own restart, so the
|
|
||||||
job state machine lives on the server:
|
|
||||||
|
|
||||||
- **queued → running** when the envelope is dispatched.
|
|
||||||
- **running → succeeded** when the agent re-hellos with
|
|
||||||
`agent_version == server.Version` after dispatch and within
|
|
||||||
the timeout. Audit `host.update_succeeded`.
|
|
||||||
- **running → failed (timeout)** if 90 seconds pass without a
|
|
||||||
hello carrying the matching version. Audit `host.update_failed`.
|
|
||||||
Raise alert kind `update_failed` (reuses P3-05 alert engine).
|
|
||||||
This single transition covers both the "agent never came back
|
|
||||||
at all" case and the "agent came back at the wrong version"
|
|
||||||
case — see §6.2 for why we don't transition immediately on a
|
|
||||||
mismatched hello.
|
|
||||||
|
|
||||||
Migration 0021 widens the `jobs.kind` CHECK constraint to include
|
|
||||||
`update`. Same column-level pattern as 0012 (where 0012 added
|
|
||||||
`restore` and `diff`).
|
|
||||||
|
|
||||||
## 4. Agent-side execution
|
|
||||||
|
|
||||||
Lives in `internal/agent/updater`, build-tag split:
|
|
||||||
|
|
||||||
- `updater_unix.go` — Linux + any future POSIX target.
|
|
||||||
- `updater_windows.go` — Windows-only, uses the helper-script
|
|
||||||
pattern.
|
|
||||||
- `updater.go` — shared `Update(ctx, serverURL string) error`
|
|
||||||
interface and the HTTP fetch/streaming code (no platform deps).
|
|
||||||
|
|
||||||
### 4.1 Linux flow
|
|
||||||
|
|
||||||
1. Receive `command.update` from the WS dispatcher.
|
|
||||||
2. Resolve own binary via `os.Executable()` and `filepath.Abs`.
|
|
||||||
Refuse if the resolved path is `/proc/self/exe` or otherwise
|
|
||||||
not a real file (defence in depth — shouldn't happen under
|
|
||||||
systemd, but bail loudly if it does).
|
|
||||||
3. `GET <server>/agent/binary?os=linux&arch=<runtime.GOARCH>`,
|
|
||||||
stream to `<binary>.new` in the same directory as the running
|
|
||||||
binary (same filesystem ⇒ atomic rename).
|
|
||||||
4. fsync the file, `os.Chmod(0755)`.
|
|
||||||
5. Copy current binary to `<binary>.old` (overwrite if it
|
|
||||||
exists). M1 — one-revision rollback target.
|
|
||||||
6. `os.Rename(<binary>.new, <binary>)`.
|
|
||||||
7. Close the WS connection cleanly (sends close frame so the
|
|
||||||
server transitions the connection to `disconnected` rather
|
|
||||||
than waiting for the heartbeat-miss sweep).
|
|
||||||
8. `os.Exit(0)`. Systemd's `Restart=always` (already in the unit)
|
|
||||||
brings up the new binary within seconds.
|
|
||||||
|
|
||||||
### 4.2 Windows flow
|
|
||||||
|
|
||||||
The .exe is exclusively locked by the OS while running, so steps
|
|
||||||
5–6 above can't happen in-process. Use a detached helper:
|
|
||||||
|
|
||||||
1. Steps 1–4 the same, except the fetch requests `os=windows` and streams into `<binary>.exe.new`; fsync.
|
|
||||||
2. Write `update.cmd` to a tmp path with the orchestration:
|
|
||||||
```
|
|
||||||
timeout /t 3 /nobreak >nul
|
|
||||||
copy /Y "<binary>.exe" "<binary>.exe.old"
|
|
||||||
sc stop restic-manager-agent
|
|
||||||
:wait
|
|
||||||
sc query restic-manager-agent | find "STOPPED" >nul
|
|
||||||
if errorlevel 1 (timeout /t 1 /nobreak >nul & goto wait)
|
|
||||||
move /Y "<binary>.exe.new" "<binary>.exe"
|
|
||||||
sc start restic-manager-agent
|
|
||||||
del "%~f0"
|
|
||||||
```
|
|
||||||
3. `CreateProcess` it detached
|
|
||||||
(`DETACHED_PROCESS | CREATE_NO_WINDOW`, no parent handles).
|
|
||||||
4. Close WS, `os.Exit(0)`. SCM sees clean stop and waits — does
|
|
||||||
*not* try to restart, because `sc stop` is the helper's job,
|
|
||||||
not a crash. (`Restart=always` semantics differ between
|
|
||||||
systemd and SCM. SCM treats clean-exit-after-stop as
|
|
||||||
intentional and does not auto-restart; only crashes restart.
|
|
||||||
That's why the helper script needs the explicit `sc start`
|
|
||||||
at the end.)
|
|
||||||
|
|
||||||
### 4.3 Service-user assumption
|
|
||||||
|
|
||||||
Both Linux (`User=root` per the existing unit) and Windows
|
|
||||||
(`LocalSystem` by default) can write the binary path directly. If
|
|
||||||
the agent ever moves to a non-root service user, the updater
|
|
||||||
breaks — would need either a setuid helper or an out-of-process
|
|
||||||
update service. Add a `// NOTE:` comment in the updater package
|
|
||||||
flagging this; not a v1 blocker.
|
|
||||||
|
|
||||||
## 5. Server build version
|
|
||||||
|
|
||||||
New package `internal/version` exposing two package-level string variables (`var`, not `const` — `-ldflags -X` can only set variables):
|
|
||||||
|
|
||||||
```
|
|
||||||
package version
|
|
||||||
|
|
||||||
var (
|
|
||||||
Version = "dev"
|
|
||||||
Commit = ""
|
|
||||||
)
|
|
||||||
```
|
|
||||||
|
|
||||||
Wired via `-ldflags` in the Makefile:
|
|
||||||
|
|
||||||
```
|
|
||||||
GO_LDFLAGS = -X gitea.dcglab.co.uk/steve/restic-manager/internal/version.Version=$(VERSION) \
|
|
||||||
-X gitea.dcglab.co.uk/steve/restic-manager/internal/version.Commit=$(COMMIT)
|
|
||||||
|
|
||||||
VERSION := $(shell git describe --tags --always --dirty)
|
|
||||||
COMMIT := $(shell git rev-parse --short HEAD)
|
|
||||||
```
|
|
||||||
|
|
||||||
Both `cmd/server` and `cmd/agent` link the same package, so an
|
|
||||||
agent's `agent_version` (sent in the hello payload, already wired
|
|
||||||
since P1-11) is comparable byte-for-byte to the server's
|
|
||||||
`version.Version`.
|
|
||||||
|
|
||||||
`make build` already does what's needed for source builds. The
|
|
||||||
Phase 2 work in this spec is the Docker release path — confirm
|
|
||||||
during plan execution that `.gitea/workflows/release.yml` passes
|
|
||||||
`VERSION` and `COMMIT` into the Docker `--build-arg` chain so the
|
|
||||||
in-image binaries embed the same string the image is tagged with.
|
|
||||||
If not, add the wiring.
|
|
||||||
|
|
||||||
Dirty/dev builds (`v1.2.3-dirty`) won't match clean server builds,
|
|
||||||
so every dev environment will show every host as out-of-date. This
|
|
||||||
is acceptable — the chip is a noop in dev, real ops always run
|
|
||||||
tagged builds.
|
|
||||||
|
|
||||||
A new `GET /api/version` endpoint returns
|
|
||||||
`{"version": "...", "commit": "..."}`. Used by the dashboard
|
|
||||||
header tile and by `/settings/fleet-update`. Public-band — exposes
|
|
||||||
no secrets, lets the install scripts surface it too.
|
|
||||||
|
|
||||||
## 6. P6-01 server endpoints
|
|
||||||
|
|
||||||
### 6.1 `POST /api/hosts/{id}/update`
|
|
||||||
|
|
||||||
Admin-only. Refuses (with structured error code) when:
|
|
||||||
|
|
||||||
- Host is offline (`host_offline`).
|
|
||||||
- Host's `agent_version == server.Version` (`already_up_to_date`).
|
|
||||||
- An update job for this host is already running (`update_in_progress`).
|
|
||||||
|
|
||||||
Happy path: creates `jobs` row with `kind=update`, dispatches
|
|
||||||
`command.update` envelope, audit-logs `host.update_dispatched`,
|
|
||||||
returns `{"job_id": "..."}`.
|
|
||||||
|
|
||||||
UI form-post variant on `/hosts/{id}/update` returns
|
|
||||||
`HX-Redirect` to the live job log.
|
|
||||||
|
|
||||||
### 6.2 Hello handler integration
|
|
||||||
|
|
||||||
The existing `onAgentHello` (P1-11) already upserts
|
|
||||||
`agent_version`. Extend it: after the upsert, look for any
|
|
||||||
`update` job for this host with `status='running'`. If one
|
|
||||||
exists:
|
|
||||||
|
|
||||||
- `agent_version == server.Version` → mark job `succeeded`,
|
|
||||||
audit `host.update_succeeded`.
|
|
||||||
- `agent_version != server.Version` → leave the job running so
|
|
||||||
the timeout path catches it as a rollback failure (don't fail
|
|
||||||
immediately — gives the agent one chance to come back, restart,
|
|
||||||
hello again with the right version).
|
|
||||||
|
|
||||||
Adds a small in-memory map of pending updates so the timeout
|
|
||||||
goroutine knows when to give up. Persisted state lives in the
|
|
||||||
`jobs` table; the in-memory map is just for the timer.
|
|
||||||
|
|
||||||
## 7. P6-02 fleet update
|
|
||||||
|
|
||||||
### 7.1 Schema
|
|
||||||
|
|
||||||
Migration 0022, additive only — two new tables, no changes to existing ones:
|
|
||||||
|
|
||||||
```
|
|
||||||
CREATE TABLE fleet_updates (
|
|
||||||
id TEXT PRIMARY KEY,
|
|
||||||
started_at TEXT NOT NULL,
|
|
||||||
started_by_user_id TEXT NOT NULL REFERENCES users(id),
|
|
||||||
target_version TEXT NOT NULL,
|
|
||||||
status TEXT NOT NULL CHECK (status IN ('running','completed','halted','cancelled')),
|
|
||||||
current_host_id TEXT REFERENCES hosts(id),
|
|
||||||
halted_reason TEXT,
|
|
||||||
completed_at TEXT
|
|
||||||
);
|
|
||||||
|
|
||||||
CREATE TABLE fleet_update_hosts (
|
|
||||||
fleet_update_id TEXT NOT NULL REFERENCES fleet_updates(id) ON DELETE CASCADE,
|
|
||||||
host_id TEXT NOT NULL REFERENCES hosts(id) ON DELETE CASCADE,
|
|
||||||
status TEXT NOT NULL CHECK (status IN ('pending','running','succeeded','failed','skipped')),
|
|
||||||
job_id TEXT REFERENCES jobs(id),
|
|
||||||
failed_reason TEXT,
|
|
||||||
PRIMARY KEY (fleet_update_id, host_id)
|
|
||||||
);
|
|
||||||
```
|
|
||||||
|
|
||||||
### 7.2 Worker loop
|
|
||||||
|
|
||||||
A single in-process goroutine — at most one fleet update may run
|
|
||||||
at a time (enforced via a `sync.Mutex` + a precondition check on
|
|
||||||
`POST /api/fleet/update`).
|
|
||||||
|
|
||||||
```
|
|
||||||
for each pending fleet_update_hosts row in dispatch order:
|
|
||||||
set fleet_updates.current_host_id = row.host_id
|
|
||||||
set fleet_update_hosts.status = 'running'
|
|
||||||
if host.agent_version == server.Version:
|
|
||||||
# Already updated since we built the list — skip.
|
|
||||||
set status = 'skipped'; continue
|
|
||||||
if !host.online:
|
|
||||||
# Offline since we built the list — halt.
|
|
||||||
halt(reason="host went offline")
|
|
||||||
return
|
|
||||||
dispatch_update_for_host(host) # reuses 6.1 logic
|
|
||||||
wait_up_to_90s_for_hello_with_matching_version()
|
|
||||||
if matched:
|
|
||||||
set status = 'succeeded'; continue
|
|
||||||
else:
|
|
||||||
set status = 'failed', failed_reason = "..."
|
|
||||||
halt(reason="update failed on host X")
|
|
||||||
return
|
|
||||||
set fleet_updates.status = 'completed', completed_at = now
|
|
||||||
```
|
|
||||||
|
|
||||||
Halt: set `fleet_updates.status = 'halted'`, raise an alert kind
|
|
||||||
`fleet_update_halted`, audit `fleet.update_halted` with the host
|
|
||||||
id and reason. Subsequent hosts stay `pending` so the operator can
|
|
||||||
see what was queued and decide whether to resume (resume = start a
|
|
||||||
new fleet update with the still-out-of-date subset).
|
|
||||||
|
|
||||||
Cancel: admin-only `POST /api/fleet-updates/{id}/cancel`. Sets
|
|
||||||
`status='cancelled'`. The currently-dispatched host's update job
|
|
||||||
keeps running (the agent is already mid-restart) — cancel only
|
|
||||||
prevents the *next* host from being picked. Audit
|
|
||||||
`fleet.update_cancelled`.
|
|
||||||
|
|
||||||
### 7.3 UI surfaces
|
|
||||||
|
|
||||||
**Per-host chip (host_row partial + host detail chrome):**
|
|
||||||
|
|
||||||
`out of date · v1.2.2 → v1.2.3` — amber-accented, mirrors `.tag`
|
|
||||||
token shape. Only rendered when:
|
|
||||||
|
|
||||||
```
|
|
||||||
host.agent_version != "" && host.agent_version != server.Version
|
|
||||||
```
|
|
||||||
|
|
||||||
Empty `agent_version` (host enrolled but never connected) renders
|
|
||||||
nothing rather than "out of date" — we don't know what version
|
|
||||||
they have.
|
|
||||||
|
|
||||||
**Dashboard summary tile:**
|
|
||||||
|
|
||||||
The hero strip already has tiles. Add an "Updates" tile:
|
|
||||||
`N hosts behind` linking to `/?updates=behind` (extends NS-04's
|
|
||||||
filter machinery — adds an `updates` query param alongside
|
|
||||||
`status`/`repo_status`/`tag`). Hidden when N == 0.
|
|
||||||
|
|
||||||
**Per-host Update button on `/hosts/{id}`:**
|
|
||||||
|
|
||||||
Right-rail, admin-only. Disabled with hover tooltip when host
|
|
||||||
offline / already up to date / update in progress. POSTs to
|
|
||||||
`/hosts/{id}/update`, `HX-Redirect` to the live job log.
|
|
||||||
|
|
||||||
**Fleet update page `/settings/fleet-update`:**
|
|
||||||
|
|
||||||
Admin-only. Two states:
|
|
||||||
|
|
||||||
- **Idle**: lists out-of-date online hosts (table: hostname,
|
|
||||||
current version, target version, last seen). Big "Start rolling
|
|
||||||
update" button behind a typed-confirm dialog (operator types
|
|
||||||
the host count, e.g. `12`, to enable the button — same shape as
|
|
||||||
the host-delete confirm).
|
|
||||||
- **Running/halted/completed**: shows the currently-active
|
|
||||||
fleet_update row + per-host progress list. Polls every 3s (htmx
|
|
||||||
trigger conditional on `document.visibilityState === 'visible'`,
|
|
||||||
same pattern as the alerts page). Renders:
|
|
||||||
```
|
|
||||||
Updated 3/12 · currently updating <hostname>
|
|
||||||
Halted on <hostname>: <reason> · job log →
|
|
||||||
```
|
|
||||||
|
|
||||||
Audit actions: `fleet.update_started`, `fleet.update_completed`,
|
|
||||||
`fleet.update_halted`, `fleet.update_cancelled`.
|
|
||||||
|
|
||||||
### 7.4 Alert engine integration
|
|
||||||
|
|
||||||
P3-05's alert engine already supports kind-based registration. Add
|
|
||||||
two new kinds:
|
|
||||||
|
|
||||||
- `update_failed` — per-host, raised on individual update failure.
|
|
||||||
Auto-resolves when the host re-hellos with the matching version.
|
|
||||||
- `fleet_update_halted` — global, raised on fleet halt. Auto-resolves
|
|
||||||
when a subsequent fleet update completes successfully.
|
|
||||||
|
|
||||||
## 8. RBAC
|
|
||||||
|
|
||||||
| Endpoint | Role |
|
|
||||||
|----------|------|
|
|
||||||
| `POST /api/hosts/{id}/update` | admin |
|
|
||||||
| `POST /api/fleet/update` | admin |
|
|
||||||
| `POST /api/fleet-updates/{id}/cancel` | admin |
|
|
||||||
| `GET /api/fleet-updates/{id}` | admin (status polling) |
|
|
||||||
| `GET /api/version` | public |
|
|
||||||
|
|
||||||
Operator and viewer see the "out of date" chip but no update
|
|
||||||
buttons. Mirrors the existing pattern: read affordances are
|
|
||||||
visible to all roles, write affordances are gated.
|
|
||||||
|
|
||||||
## 9. Testing
|
|
||||||
|
|
||||||
### 9.1 Unit
|
|
||||||
|
|
||||||
- `internal/agent/updater`: fake-`/agent/binary` HTTP server +
|
|
||||||
tmp "running binary" file, assert post-state — binary swapped,
|
|
||||||
`.old` present, no leftover `.new`. Linux path only (Windows
|
|
||||||
helper covered by build-tag compile-only).
|
|
||||||
- `internal/server/http`: `POST /api/hosts/{id}/update` happy
|
|
||||||
path, refuses-when-offline, refuses-when-up-to-date,
|
|
||||||
refuses-when-update-in-progress, RBAC enforcement, audit row
|
|
||||||
written.
|
|
||||||
- Hello handler: agent reconnects with matching version after
|
|
||||||
`update` job dispatch → marks job `succeeded`, drops the
|
|
||||||
in-memory pending entry. Mismatched version → no-op (timeout
|
|
||||||
catches it).
|
|
||||||
- Timeout path: synthetic `update` job + 90s elapsed →
|
|
||||||
marks `failed`, raises alert.
|
|
||||||
- Fleet worker: table-driven over the loop's state machine —
|
|
||||||
success-then-success, success-then-timeout-halts,
|
|
||||||
cancel-mid-flight, no-online-out-of-date-hosts-completes-immediately,
|
|
||||||
host-disappears-from-list-mid-loop-skips.
|
|
||||||
|
|
||||||
### 9.2 Smoke validation (per CLAUDE.md restage block)
|
|
||||||
|
|
||||||
1. Build server + agent at version A. Restage. Enrol a host;
|
|
||||||
confirm `agent_version=A`.
|
|
||||||
2. Bump version to B (`make build VERSION=B`), rebuild server
|
|
||||||
only, restart server. Dashboard shows host as out-of-date with
|
|
||||||
`A → B` chip. Updates tile reads "1 host behind".
|
|
||||||
3. Rebuild agent at B, restage `<DataDir>/agent-binaries/`. Click
|
|
||||||
**Update agent** on host detail. Agent fetches, swaps, exits;
|
|
||||||
systemd restarts it; hello-back at B → job `succeeded`, chip
|
|
||||||
gone, tile clears.
|
|
||||||
4. Rollback path: leave `<DataDir>/agent-binaries/` at A, server
|
|
||||||
at B, click Update — agent fetches A, swaps to A, restarts at
|
|
||||||
A; hello says A != B; server marks job `failed` after 90s with
|
|
||||||
reason "agent reconnected at version A, expected B".
|
|
||||||
5. Fleet update: spin up two smoke hosts both out-of-date, fire
|
|
||||||
**Start rolling update**, watch progress page tick host 1 →
|
|
||||||
host 2 → completed.
|
|
||||||
6. Halt path: replace one of the `<DataDir>/agent-binaries/`
|
|
||||||
files with `/bin/false`. Run fleet update. First host gets
|
|
||||||
broken binary, fails to come back up, fleet update halts at
|
|
||||||
host 1 after 90s, alert raised, host 2 left as `pending`.
|
|
||||||
|
|
||||||
Step 6 validates M2 end-to-end — the rolling halt is the actual
|
|
||||||
safety guarantee, not a nice-to-have.
|
|
||||||
|
|
||||||
## 10. Out of scope
|
|
||||||
|
|
||||||
- sha256 digest verification (deferred — see decision 4).
|
|
||||||
- `restic-manager-agent update` CLI subcommand (deferred —
|
|
||||||
decision 6).
|
|
||||||
- Auto-update (deferred — decision 1).
|
|
||||||
- Auto-rollback watchdog M3 (deferred — decision 3).
|
|
||||||
- Migrating the agent off `User=root` (separate hardening track).
|
|
||||||
- Cross-version protocol-compatibility checks beyond the existing
|
|
||||||
`protocol_version` handshake (P1-11). If the new agent's
|
|
||||||
`protocol_version` is incompatible with the server, the
|
|
||||||
existing handshake rejects it; the update job will then
|
|
||||||
correctly time out and be marked failed.
|
|
||||||
|
|
||||||
## 11. Migration plan
|
|
||||||
|
|
||||||
1. `internal/version` package + Makefile ldflags wiring.
|
|
||||||
2. Migration 0021 (jobs.kind widening) + 0022 (fleet_updates
|
|
||||||
tables).
|
|
||||||
3. `internal/agent/updater` package, Linux first.
|
|
||||||
4. WS envelope wiring + `command.update` dispatcher.
|
|
||||||
5. `POST /api/hosts/{id}/update` + hello-handler integration +
|
|
||||||
timeout goroutine.
|
|
||||||
6. UI: chip + per-host update button + dashboard tile + filter.
|
|
||||||
7. Fleet update worker + page.
|
|
||||||
8. Windows updater path.
|
|
||||||
9. Alert engine kinds.
|
|
||||||
10. Smoke validation per §9.2.
|
|
||||||
|
|
||||||
Each step is independently testable; commits should land at each
|
|
||||||
boundary so a failed Windows path (8) doesn't block the rest of
|
|
||||||
the work.
|
|
||||||
@@ -1,223 +0,0 @@
|
|||||||
# P6-03 — Repo size trend graphs
|
|
||||||
|
|
||||||
Sparkline on the dashboard host row + full chart on the host repo
|
|
||||||
page, both showing repo growth over time. Closes the last
|
|
||||||
operator-visibility gap in Phase 6 alongside Prometheus metrics
|
|
||||||
(P6-04).
|
|
||||||
|
|
||||||
## Goals
|
|
||||||
|
|
||||||
- Operators can see at a glance whether a host's repo is growing,
|
|
||||||
stable, or shrinking, without leaving the dashboard.
|
|
||||||
- A second screen on the repo page exposes the same data over a
|
|
||||||
longer window with a snapshot-count overlay so retention
|
|
||||||
behaviour can be eyeballed against size.
|
|
||||||
- Zero new client-side dependencies; matches the existing
|
|
||||||
HTMX + server-rendered idiom used everywhere else in the UI.
|
|
||||||
|
|
||||||
## Non-goals
|
|
||||||
|
|
||||||
- No backfill of historical data. Trend lights up with whatever
|
|
||||||
the agents report from the day this ships.
|
|
||||||
- No per-source-group breakdown — repo-level only.
|
|
||||||
- No alerting on growth rate (deferred to a future ticket if a
|
|
||||||
user asks).
|
|
||||||
- No JSON API surface. Prometheus exposure is P6-04, separate.
|
|
||||||
|
|
||||||
## Decisions taken in brainstorming
|
|
||||||
|
|
||||||
- **Metrics:** `total_size_bytes` (sparkline + chart) and
|
|
||||||
`snapshot_count` (chart only). Raw size dropped as redundant.
|
|
||||||
- **Cadence:** one row per `(host_id, UTC date)`, last-write-wins
|
|
||||||
per column. Bounded at ~365 rows/host/year regardless of job
|
|
||||||
frequency.
|
|
||||||
- **Backfill:** none. Pure forward-fill from launch day.
|
|
||||||
- **Rendering:** server-rendered inline SVG, no JS library.
|
|
||||||
- **Spans:** sparkline fixed at 30 days; chart has `30d | 90d | 1y`
|
|
||||||
range selector, server-rendered swap.
|
|
||||||
|
|
||||||
## Schema
|
|
||||||
|
|
||||||
New migration `internal/store/migrations/0023_host_repo_stats_history.sql`:
|
|
||||||
|
|
||||||
```sql
|
|
||||||
CREATE TABLE host_repo_stats_history (
|
|
||||||
host_id TEXT NOT NULL REFERENCES hosts(id) ON DELETE CASCADE,
|
|
||||||
day TEXT NOT NULL, -- 'YYYY-MM-DD' UTC
|
|
||||||
total_size_bytes INTEGER, -- nullable; partial patches don't overwrite
|
|
||||||
snapshot_count INTEGER, -- nullable
|
|
||||||
recorded_at TEXT NOT NULL, -- RFC3339Nano of last write touching this row
|
|
||||||
PRIMARY KEY (host_id, day)
|
|
||||||
);
|
|
||||||
CREATE INDEX host_repo_stats_history_host_day
|
|
||||||
ON host_repo_stats_history(host_id, day DESC);
|
|
||||||
```
|
|
||||||
|
|
||||||
FK cascade matches every other host-scoped table; deleting a host
|
|
||||||
through `Store.DeleteHost` (NS-01) wipes its history automatically.
|
|
||||||
|
|
||||||
## Write path
|
|
||||||
|
|
||||||
Hook the existing `MsgRepoStats` handler in
|
|
||||||
`internal/server/ws/handler.go` (around line 319). After the
|
|
||||||
existing `UpsertHostRepoStats(ctx, hostID, patch)` call, append:
|
|
||||||
|
|
||||||
```go
|
|
||||||
day := time.Now().UTC().Format("2006-01-02")
|
|
||||||
if err := deps.Store.UpsertHostRepoStatsHistory(ctx, hostID, day, patch); err != nil {
|
|
||||||
slog.Warn("ws: upsert host repo stats history", "host_id", hostID, "err", err)
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
A history-write failure is logged and dropped — never blocks the
|
|
||||||
main upsert. The partial-update contract that
|
|
||||||
`UpsertHostRepoStats` already implements is preserved at the
|
|
||||||
history layer:
|
|
||||||
|
|
||||||
```sql
|
|
||||||
INSERT INTO host_repo_stats_history (host_id, day, total_size_bytes, snapshot_count, recorded_at)
|
|
||||||
VALUES (?, ?, ?, ?, ?)
|
|
||||||
ON CONFLICT(host_id, day) DO UPDATE SET
|
|
||||||
total_size_bytes = COALESCE(excluded.total_size_bytes, host_repo_stats_history.total_size_bytes),
|
|
||||||
snapshot_count = COALESCE(excluded.snapshot_count, host_repo_stats_history.snapshot_count),
|
|
||||||
recorded_at = excluded.recorded_at;
|
|
||||||
```
|
|
||||||
|
|
||||||
This is critical: the agent's prune handler in
|
|
||||||
`internal/agent/runner/runner.go:318` emits a stats patch that
|
|
||||||
only carries `LastPruneAt`. Without `COALESCE`, that prune ack
|
|
||||||
would null out a `total_size_bytes` we'd already captured from a
|
|
||||||
backup earlier the same day.
|
|
||||||
|
|
||||||
## Read path
|
|
||||||
|
|
||||||
Two new helpers in `internal/store/host_repo_stats_history.go`:
|
|
||||||
|
|
||||||
```go
|
|
||||||
type RepoStatsHistoryPoint struct {
|
|
||||||
Day time.Time // 00:00:00 UTC
|
|
||||||
TotalSizeBytes *int64
|
|
||||||
SnapshotCount *int64
|
|
||||||
}
|
|
||||||
|
|
||||||
func (s *Store) ListHostRepoStatsHistory(
|
|
||||||
ctx context.Context, hostID string, since time.Time,
|
|
||||||
) ([]RepoStatsHistoryPoint, error)
|
|
||||||
```
|
|
||||||
|
|
||||||
Returns rows ordered by `day` ascending where at least one metric
|
|
||||||
is non-null. The renderer connects available points with a
|
|
||||||
straight line — there is no explicit gap representation. A host
|
|
||||||
that was offline for a week shows a single segment spanning the
|
|
||||||
gap, which is the right visual: the repo state didn't change.
|
|
||||||
|
|
||||||
## Rendering
|
|
||||||
|
|
||||||
New package `internal/web/sparkline`. Pure Go, no template
|
|
||||||
dependency:
|
|
||||||
|
|
||||||
```go
|
|
||||||
type Series struct {
|
|
||||||
Name string
|
|
||||||
Points []float64 // missing points are represented as math.NaN()
|
|
||||||
Stroke string // CSS color
|
|
||||||
}
|
|
||||||
|
|
||||||
func RenderSparkline(points []float64, width, height int) template.HTML
|
|
||||||
func RenderChart(series []Series, days []time.Time, opts ChartOpts) template.HTML
|
|
||||||
```
|
|
||||||
|
|
||||||
`RenderChart` produces a 600×220 SVG with:
|
|
||||||
|
|
||||||
- Light horizontal gridlines (4 bands).
|
|
||||||
- Two y-axes: bytes (left, blue) and count (right, amber). Each
|
|
||||||
series is normalised against its own axis.
|
|
||||||
- X-axis labels at start, midpoint, and end of the window.
|
|
||||||
- Per-point `<circle>` with a `<title>` for hover tooltips —
|
|
||||||
accessible by default, no JS.
|
|
||||||
- Empty state: faint dashed baseline + centered "no data yet"
|
|
||||||
text.
|
|
||||||
|
|
||||||
Sparkline is 80×20, single blue polyline, single `<title>` on the
|
|
||||||
group element showing `"current → 30d ago"`.
|
|
||||||
|
|
||||||
Two new partials:
|
|
||||||
|
|
||||||
- `web/templates/partials/repo_size_sparkline.html`
|
|
||||||
- `web/templates/partials/repo_size_chart.html`
|
|
||||||
|
|
||||||
Both call into the renderer with the appropriate opts. No
|
|
||||||
inline `<style>` — colours come from existing Tailwind palette
|
|
||||||
classes already used elsewhere (`text-blue-500`, `text-amber-500`).
|
|
||||||
|
|
||||||
## UI placement
|
|
||||||
|
|
||||||
### Dashboard host row
|
|
||||||
|
|
||||||
`web/templates/partials/host_row.html` gains one `<td>` between
|
|
||||||
the existing "Repo size" cell and "Snapshots" cell. Width ≈ 88px.
|
|
||||||
Cell renders the sparkline partial; if `len(points) < 2` the cell
|
|
||||||
shows "—" centred (matches the existing no-data idiom for
|
|
||||||
last-backup time in the same partial).
|
|
||||||
|
|
||||||
The dashboard's existing 5-second htmx live-refresh
|
|
||||||
(`hx-trigger="every 5s ..."` from NS-04) re-renders this cell
|
|
||||||
along with the rest of the row. No extra polling.
|
|
||||||
|
|
||||||
### Host repo page
|
|
||||||
|
|
||||||
`web/templates/pages/host_repo.html` gains a "Trend" panel
|
|
||||||
inserted between the existing summary panel and the maintenance
|
|
||||||
panel. Panel contains:
|
|
||||||
|
|
||||||
- Range pills `30d | 90d | 1y` (anchor links with
|
|
||||||
`hx-get="/hosts/{id}/repo/trend?range=…"` and
|
|
||||||
`hx-target="#repo-trend-chart" hx-swap="outerHTML"`).
|
|
||||||
- The chart partial wrapped in `<div id="repo-trend-chart">`.
|
|
||||||
- A small legend strip below the chart.
|
|
||||||
|
|
||||||
## Endpoints
|
|
||||||
|
|
||||||
- `GET /hosts/{id}/repo/trend?range=30d|90d|1y` — admin/operator,
|
|
||||||
htmx fragment, returns the chart partial. Auth reuses the
|
|
||||||
existing host-scoped middleware on the `/hosts/{id}` family.
|
|
||||||
Invalid `range` falls back to 30d.
|
|
||||||
|
|
||||||
No new admin-only surface — anyone with read access to the host
|
|
||||||
can see the trend.
|
|
||||||
|
|
||||||
## Testing
|
|
||||||
|
|
||||||
- `internal/store/host_repo_stats_history_test.go` — upsert
|
|
||||||
merges partial patches without nulling; ordering; since-day
|
|
||||||
filter; cascade on host delete.
|
|
||||||
- `internal/web/sparkline/sparkline_test.go` — golden SVG files
|
|
||||||
for: empty input, single point, full 30-day series, mixed
|
|
||||||
null points. Goldens live under `testdata/`.
|
|
||||||
- `internal/server/http/ui_repo_test.go` — trend panel renders
|
|
||||||
with seeded history; range selector swaps server-side; empty
|
|
||||||
state.
|
|
||||||
- `internal/server/http/ui_dashboard_test.go` — host row sparkline
|
|
||||||
cell present and renders SVG when points exist, "—" when not.
|
|
||||||
- Smoke after build: dashboard row shows sparkline once two days
|
|
||||||
of data exist; repo page chart toggles cleanly between ranges.
|
|
||||||
|
|
||||||
## Migration / rollout
|
|
||||||
|
|
||||||
- Schema migration is additive — no risk to existing tables.
|
|
||||||
- Write path is best-effort; on schema issue the main repo-stats
|
|
||||||
upsert is unaffected.
|
|
||||||
- No agent change required, so no fleet update needed.
|
|
||||||
|
|
||||||
## Acceptance
|
|
||||||
|
|
||||||
- After two days of operation, the dashboard sparkline shows a
|
|
||||||
visible line for any host that has run a backup or
|
|
||||||
maintenance op on both days.
|
|
||||||
- Host repo page renders the trend panel with the snapshot-count
|
|
||||||
overlay; range selector switches view without a full page
|
|
||||||
reload.
|
|
||||||
- `go test ./...` and `go vet ./...` clean.
|
|
||||||
- Smoke env exercise: backup → sparkline updates; range pills
|
|
||||||
swap; FK cascade verified by deleting a host and checking the
|
|
||||||
history table.
|
|
||||||
@@ -1,42 +0,0 @@
|
|||||||
# Build a Linux container that runs the restic-manager agent against a
|
|
||||||
# sibling rest-server in the e2e compose stack. Used only by tests
|
|
||||||
# (e2e/compose.e2e.yml + .gitea/workflows/e2e.yml).
|
|
||||||
#
|
|
||||||
# Two stages:
|
|
||||||
# 1. golang:alpine to build the agent binary.
|
|
||||||
# 2. alpine:3.20 with the `restic` package + the built binary.
|
|
||||||
#
|
|
||||||
# Base images are pinned by tag (not digest) for CI reproducibility.
|
|
||||||
|
|
||||||
FROM golang:1.25-alpine AS build
|
|
||||||
WORKDIR /src
|
|
||||||
|
|
||||||
ENV CGO_ENABLED=0 \
|
|
||||||
GOFLAGS="-trimpath"
|
|
||||||
|
|
||||||
COPY go.mod go.sum* ./
|
|
||||||
RUN go mod download
|
|
||||||
|
|
||||||
COPY . .
|
|
||||||
ARG VERSION=e2e
|
|
||||||
RUN go build -ldflags="-s -w -X gitea.dcglab.co.uk/steve/restic-manager/internal/version.Version=${VERSION}" \
|
|
||||||
-o /out/restic-manager-agent ./cmd/agent
|
|
||||||
|
|
||||||
FROM alpine:3.20
|
|
||||||
RUN apk add --no-cache restic ca-certificates curl
|
|
||||||
COPY --from=build /out/restic-manager-agent /usr/local/bin/restic-manager-agent
|
|
||||||
|
|
||||||
# Agents normally run as root because backup paths often need it. The
|
|
||||||
# e2e fixture only backs up paths under /data which we own, so this
|
|
||||||
# container would tolerate a non-root user — but staying root keeps
|
|
||||||
# parity with the production install.
|
|
||||||
USER root
|
|
||||||
|
|
||||||
# The agent needs a writable directory for its config + secrets store.
|
|
||||||
RUN mkdir -p /etc/restic-manager /var/lib/restic-manager-agent
|
|
||||||
ENV RM_AGENT_CONFIG=/etc/restic-manager/agent.yaml
|
|
||||||
|
|
||||||
# The compose entrypoint sets the announce URL via env.
|
|
||||||
COPY e2e/agent-entrypoint.sh /usr/local/bin/entrypoint.sh
|
|
||||||
RUN chmod +x /usr/local/bin/entrypoint.sh
|
|
||||||
ENTRYPOINT ["/usr/local/bin/entrypoint.sh"]
|
|
||||||
@@ -1,21 +0,0 @@
|
|||||||
# Playwright runner for the e2e suite. Built and run by
|
|
||||||
# e2e/compose.e2e.yml so the test process sits on the same docker
|
|
||||||
# network as the server, agent, and rest-server. The previous setup
|
|
||||||
# ran Playwright on the workflow runner host and reached the server
|
|
||||||
# via 127.0.0.1:8080; that fails on Gitea's act-style runners
|
|
||||||
# because the workflow steps execute inside a runner container,
|
|
||||||
# not on the host where compose publishes its ports.
|
|
||||||
|
|
||||||
FROM mcr.microsoft.com/playwright:v1.59.1-jammy
|
|
||||||
|
|
||||||
WORKDIR /work
|
|
||||||
|
|
||||||
# Install npm deps in a separate layer keyed off package.json so
|
|
||||||
# changes to specs don't bust the dep cache.
|
|
||||||
COPY e2e/playwright/package.json /work/package.json
|
|
||||||
RUN npm install --no-audit --no-fund
|
|
||||||
|
|
||||||
COPY e2e/playwright/ /work/
|
|
||||||
|
|
||||||
ENV CI=1
|
|
||||||
ENTRYPOINT ["npx", "playwright", "test"]
|
|
||||||
@@ -1,27 +0,0 @@
|
|||||||
#!/bin/sh
|
|
||||||
# Entrypoint for the e2e agent container.
|
|
||||||
#
|
|
||||||
# Three states:
|
|
||||||
# 1. Already enrolled (agent.yaml has a bearer): run the agent.
|
|
||||||
# 2. Token supplied via $RM_ENROL_TOKEN: enrol then run.
|
|
||||||
# 3. Otherwise: announce against $RM_SERVER and wait for an admin to
|
|
||||||
# accept us. The announce flow blocks until accepted, then drops
|
|
||||||
# straight into the normal run loop, so this is the test-friendly
|
|
||||||
# path.
|
|
||||||
set -eu

CFG="${RM_AGENT_CONFIG:-/etc/restic-manager/agent.yaml}"

# State 1: the config already carries an agent_token line, so the agent
# is enrolled. This path never talks to the enrolment endpoints, so it
# must not require RM_SERVER to be set.
if [ -f "$CFG" ] && grep -q '^agent_token:' "$CFG"; then
    exec restic-manager-agent -config "$CFG"
fi

# Both remaining states enrol against the server, so the URL is
# mandatory from here on; abort with a clear message when it is unset.
SERVER="${RM_SERVER:?set RM_SERVER}"

# State 2: an enrolment token was supplied via the environment.
if [ -n "${RM_ENROL_TOKEN:-}" ]; then
    exec restic-manager-agent -config "$CFG" \
        -enroll-server "$SERVER" \
        -enroll-token "$RM_ENROL_TOKEN"
fi

# State 3: announce-and-approve. Blocks until an admin accepts, then runs.
exec restic-manager-agent -config "$CFG" -enroll-server "$SERVER"
|
|
||||||
@@ -1,108 +0,0 @@
|
|||||||
# End-to-end test stack — used by .gitea/workflows/e2e.yml and by
|
|
||||||
# operators who want to run the Playwright suite locally.
|
|
||||||
#
|
|
||||||
# Three services:
|
|
||||||
# * server — restic-manager built from the working tree
|
|
||||||
# * agent — restic-manager agent built from the working tree
|
|
||||||
# (announces; Playwright accepts it during the test)
|
|
||||||
# * rest-server — the actual restic backend, sibling of the agent
|
|
||||||
#
|
|
||||||
# Run from the repo root:
|
|
||||||
# docker compose -f e2e/compose.e2e.yml up --build --abort-on-container-exit
|
|
||||||
|
|
||||||
services:
|
|
||||||
rest-server:
|
|
||||||
image: restic/rest-server:0.13.0
|
|
||||||
environment:
|
|
||||||
DATA_DIR: /data
|
|
||||||
OPTIONS: "--no-auth"
|
|
||||||
volumes:
|
|
||||||
- rest-data:/data
|
|
||||||
networks: [rmnet]
|
|
||||||
|
|
||||||
server:
|
|
||||||
build:
|
|
||||||
context: ..
|
|
||||||
dockerfile: deploy/Dockerfile.server
|
|
||||||
args:
|
|
||||||
VERSION: e2e
|
|
||||||
environment:
|
|
||||||
RM_LISTEN: ":8080"
|
|
||||||
RM_DATA_DIR: "/data"
|
|
||||||
RM_BASE_URL: "http://server:8080"
|
|
||||||
RM_COOKIE_SECURE: "false"
|
|
||||||
# Bind the metrics endpoint loose for the test, so one of the
|
|
||||||
# Playwright assertions can exercise it.
|
|
||||||
RM_METRICS_TRUSTED_CIDR: "0.0.0.0/0"
|
|
||||||
volumes:
|
|
||||||
- server-data:/data
|
|
||||||
ports:
|
|
||||||
- "127.0.0.1:8080:8080"
|
|
||||||
healthcheck:
|
|
||||||
test: ["CMD", "/usr/local/bin/restic-manager-server", "--version"]
|
|
||||||
interval: 2s
|
|
||||||
timeout: 2s
|
|
||||||
retries: 30
|
|
||||||
networks: [rmnet]
|
|
||||||
|
|
||||||
agent:
|
|
||||||
build:
|
|
||||||
context: ..
|
|
||||||
dockerfile: e2e/Dockerfile.agent
|
|
||||||
args:
|
|
||||||
VERSION: e2e
|
|
||||||
environment:
|
|
||||||
RM_SERVER: "http://server:8080"
|
|
||||||
depends_on:
|
|
||||||
- server
|
|
||||||
volumes:
|
|
||||||
# Source paths the agent backs up. Compose pre-populates this
|
|
||||||
# with a few files so the snapshot list isn't empty.
|
|
||||||
- source-data:/source
|
|
||||||
- agent-config:/etc/restic-manager
|
|
||||||
- agent-state:/var/lib/restic-manager-agent
|
|
||||||
networks: [rmnet]
|
|
||||||
|
|
||||||
# Playwright test runner. Profile-gated so `compose up` doesn't
|
|
||||||
# start it; CI runs it via `compose run --rm playwright`. Lives on
|
|
||||||
# rmnet so it can reach the server via its compose-network DNS
|
|
||||||
# name rather than depending on host port-publish (which doesn't
|
|
||||||
# work on Gitea's container-based runners).
|
|
||||||
playwright:
|
|
||||||
profiles: [test]
|
|
||||||
build:
|
|
||||||
context: ..
|
|
||||||
dockerfile: e2e/Dockerfile.playwright
|
|
||||||
environment:
|
|
||||||
RM_BASE_URL: "http://server:8080"
|
|
||||||
RM_BOOTSTRAP_TOKEN: "${RM_BOOTSTRAP_TOKEN:-}"
|
|
||||||
volumes:
|
|
||||||
- ./playwright/playwright-report:/work/playwright-report
|
|
||||||
- ./playwright/test-results:/work/test-results
|
|
||||||
depends_on:
|
|
||||||
- server
|
|
||||||
- agent
|
|
||||||
networks: [rmnet]
|
|
||||||
|
|
||||||
# One-shot init container that drops a couple of files into the
|
|
||||||
# source volume so backups have something to snapshot.
|
|
||||||
source-fixture:
|
|
||||||
image: alpine:3.20
|
|
||||||
command: >
|
|
||||||
sh -c 'mkdir -p /source && echo "hello world" > /source/hello.txt &&
|
|
||||||
echo "another file" > /source/two.txt && sleep 0.2'
|
|
||||||
volumes:
|
|
||||||
- source-data:/source
|
|
||||||
networks: [rmnet]
|
|
||||||
restart: "no"
|
|
||||||
|
|
||||||
volumes:
|
|
||||||
server-data:
|
|
||||||
rest-data:
|
|
||||||
source-data:
|
|
||||||
agent-config:
|
|
||||||
agent-state:
|
|
||||||
|
|
||||||
networks:
|
|
||||||
rmnet:
|
|
||||||
driver: bridge
|
|
||||||
@@ -1,14 +0,0 @@
|
|||||||
{
|
|
||||||
"name": "restic-manager-e2e",
|
|
||||||
"version": "0.0.0",
|
|
||||||
"private": true,
|
|
||||||
"type": "module",
|
|
||||||
"scripts": {
|
|
||||||
"test": "playwright test",
|
|
||||||
"test:headed": "playwright test --headed",
|
|
||||||
"test:debug": "PWDEBUG=1 playwright test"
|
|
||||||
},
|
|
||||||
"devDependencies": {
|
|
||||||
"@playwright/test": "1.59.1"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -1,31 +0,0 @@
|
|||||||
import { defineConfig, devices } from '@playwright/test';
|
|
||||||
|
|
||||||
// Single-target Chromium config: the e2e suite is narrow (smoke
|
|
||||||
// the production-shaped flow against the docker-compose stack).
|
|
||||||
// Cross-browser matrix doesn't add signal — what we're verifying is
|
|
||||||
// the server's HTML and the agent's WebSocket handshake, neither of
|
|
||||||
// which depends on browser engine.
|
|
||||||
|
|
||||||
const baseURL = process.env.RM_BASE_URL ?? 'http://127.0.0.1:8080';
|
|
||||||
|
|
||||||
export default defineConfig({
|
|
||||||
testDir: './tests',
|
|
||||||
timeout: 60_000,
|
|
||||||
expect: { timeout: 10_000 },
|
|
||||||
fullyParallel: false,
|
|
||||||
retries: process.env.CI ? 1 : 0,
|
|
||||||
workers: 1,
|
|
||||||
reporter: [['list'], ['html', { open: 'never' }]],
|
|
||||||
use: {
|
|
||||||
baseURL,
|
|
||||||
trace: 'retain-on-failure',
|
|
||||||
screenshot: 'only-on-failure',
|
|
||||||
video: 'retain-on-failure',
|
|
||||||
},
|
|
||||||
projects: [
|
|
||||||
{
|
|
||||||
name: 'chromium',
|
|
||||||
use: { ...devices['Desktop Chrome'] },
|
|
||||||
},
|
|
||||||
],
|
|
||||||
});
|
|
||||||
@@ -1,114 +0,0 @@
|
|||||||
// Helpers used by every test. The shape favours the JSON API for
|
|
||||||
// reads + accept/dispatch (deterministic, easy to assert) and the
|
|
||||||
// browser for human-facing surfaces (login form, dashboard render).
|
|
||||||
|
|
||||||
import { APIRequestContext, expect, Page } from '@playwright/test';
|
|
||||||
|
|
||||||
export const baseURL = process.env.RM_BASE_URL ?? 'http://127.0.0.1:8080';
|
|
||||||
|
|
||||||
/**
 * The fields of a host record that these helpers and the specs read
 * from the `/api/hosts` response. Only `last_backup_status` is optional.
 */
export interface HostJSON {
  id: string;
  name: string;
  status: string;
  last_backup_status?: string;
}
|
|
||||||
|
|
||||||
/**
 * Returns the bootstrap token supplied through the RM_BOOTSTRAP_TOKEN
 * environment variable. Throws when the variable is unset or empty.
 */
export async function readBootstrapToken(): Promise<string> {
  const { RM_BOOTSTRAP_TOKEN: token } = process.env;
  if (token) {
    return token;
  }
  throw new Error('RM_BOOTSTRAP_TOKEN not set — the harness scrapes it from server logs');
}
|
|
||||||
|
|
||||||
/**
 * Creates the first admin account via POST /api/bootstrap using the
 * token from readBootstrapToken(). A 409 response is treated as
 * "already bootstrapped" and is not an error, so the helper can be
 * called again against a stack that is already set up.
 *
 * Returns the credentials that were submitted so the caller can log in.
 */
export async function bootstrapAdmin(
  request: APIRequestContext,
  {
    username = 'admin',
    password = 'e2e-test-password-1234',
  }: { username?: string; password?: string } = {},
): Promise<{ username: string; password: string }> {
  const token = await readBootstrapToken();
  const response = await request.post(`${baseURL}/api/bootstrap`, {
    data: { token, username, password },
  });

  if (response.ok() || response.status() === 409 /* already bootstrapped */) {
    return { username, password };
  }
  throw new Error(`bootstrap: ${response.status()} ${await response.text()}`);
}
|
|
||||||
|
|
||||||
/**
 * Logs in through the HTML login form and waits until the browser has
 * landed on the site root (`baseURL`, with or without a trailing slash).
 */
export async function loginViaUI(page: Page, username: string, password: string): Promise<void> {
  // baseURL is interpolated into a RegExp below. Escape its regex
  // metacharacters (the dots in a host or IP, for instance) so only the
  // literal origin counts as the post-login landing URL.
  const escapedBase = baseURL.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  const landingURL = new RegExp(`^${escapedBase}/?$`);

  await page.goto(`${baseURL}/login`);
  await page.locator('#login-username').fill(username);
  await page.locator('#login-password').fill(password);

  // Start waiting for the navigation before clicking so the redirect
  // cannot complete before the listener is attached.
  await Promise.all([
    page.waitForURL(landingURL),
    page.locator('form[action="/login"] button[type="submit"]').click(),
  ]);
}
|
|
||||||
|
|
||||||
/**
 * Waits up to 60 s for the first inline "accept" form of a pending
 * host to become visible on the current page, then returns the
 * pending-host id parsed out of the form's action URL.
 */
export async function waitForPendingHostID(page: Page): Promise<string> {
  const acceptForm = page
    .locator('form[action^="/api/pending-hosts/"][action$="/accept"]')
    .first();
  await expect(acceptForm).toBeVisible({ timeout: 60_000 });

  const actionURL = await acceptForm.getAttribute('action');
  if (!actionURL) {
    throw new Error('pending host form has no action attribute');
  }

  const match = actionURL.match(/\/api\/pending-hosts\/([^/]+)\/accept/);
  if (!match) {
    throw new Error(`unexpected action URL: ${actionURL}`);
  }
  return match[1];
}
|
|
||||||
|
|
||||||
/**
 * Accepts a pending host through the JSON API, sending the restic
 * repository URL and credentials in the request body. A missing
 * repo username is sent as an empty string. Throws with the status and
 * response body when the response is not ok().
 */
export async function acceptPending(
  request: APIRequestContext,
  cookie: string,
  pendingID: string,
  repo: { url: string; username?: string; password: string },
): Promise<void> {
  const endpoint = `${baseURL}/api/pending-hosts/${pendingID}/accept`;
  const payload = {
    repo_url: repo.url,
    repo_username: repo.username ?? '',
    repo_password: repo.password,
  };

  const response = await request.post(endpoint, {
    headers: { cookie, 'content-type': 'application/json' },
    data: payload,
  });
  if (response.ok()) {
    return;
  }
  throw new Error(`accept: ${response.status()} ${await response.text()}`);
}
|
|
||||||
|
|
||||||
/**
 * Fetches GET /api/hosts with the given session cookie and returns the
 * host list. The list is read from the `items` key, falling back to
 * `hosts`, and finally to an empty array when neither is present.
 */
export async function listHosts(request: APIRequestContext, cookie: string): Promise<HostJSON[]> {
  const response = await request.get(`${baseURL}/api/hosts`, { headers: { cookie } });
  if (!response.ok()) {
    throw new Error(`list hosts: ${response.status()} ${await response.text()}`);
  }

  const payload = (await response.json()) as { items?: HostJSON[]; hosts?: HostJSON[] };
  return payload.items ?? payload.hosts ?? [];
}
|
|
||||||
|
|
||||||
/**
 * Polls the host list once a second until some host satisfies
 * `matcher`, and returns that host. Gives up once `timeoutMs` has
 * elapsed, throwing an error that includes the first host seen in the
 * most recent poll as a debugging aid.
 */
export async function waitForHostStatus(
  request: APIRequestContext,
  cookie: string,
  matcher: (h: HostJSON) => boolean,
  timeoutMs = 60_000,
): Promise<HostJSON> {
  const pause = (): Promise<void> => new Promise((resolve) => setTimeout(resolve, 1_000));
  const stopAt = Date.now() + timeoutMs;
  let lastSeen: HostJSON | undefined;

  while (Date.now() < stopAt) {
    const hosts = await listHosts(request, cookie);
    const match = hosts.find(matcher);
    if (match) {
      return match;
    }
    lastSeen = hosts[0];
    await pause();
  }

  throw new Error(`waitForHostStatus: timeout. Last seen: ${JSON.stringify(lastSeen)}`);
}
|
|
||||||
|
|
||||||
/**
 * Reads the `rm_session` cookie from the page's browser context and
 * returns it as a `name=value` string suitable for a `cookie` request
 * header. Throws when the cookie is absent.
 */
export async function getSessionCookie(page: Page): Promise<string> {
  const jar = await page.context().cookies();
  const session = jar.find(({ name }) => name === 'rm_session');
  if (!session) {
    throw new Error('rm_session cookie not set after login');
  }
  return `${session.name}=${session.value}`;
}
|
|
||||||
@@ -1,83 +0,0 @@
|
|||||||
// End-to-end smoke: bootstrap → accept pending host → run backup → see succeeded.
|
|
||||||
//
|
|
||||||
// The compose stack stands up a server, a sibling rest-server, and an
|
|
||||||
// agent in announce-and-approve mode. This test drives the operator
|
|
||||||
// path through the UI (login + dashboard) and the API
|
|
||||||
// (accept + run-now + poll for terminal) — UI for the human surfaces,
|
|
||||||
// API for the deterministic ones.
|
|
||||||
|
|
||||||
import { test, expect } from '@playwright/test';
|
|
||||||
import {
|
|
||||||
baseURL,
|
|
||||||
bootstrapAdmin,
|
|
||||||
loginViaUI,
|
|
||||||
waitForPendingHostID,
|
|
||||||
acceptPending,
|
|
||||||
waitForHostStatus,
|
|
||||||
getSessionCookie,
|
|
||||||
} from './lib/server';
|
|
||||||
|
|
||||||
test.describe('smoke: enrol-via-announce → backup', () => {
  test('happy path completes in under a minute', async ({ page, request }) => {
    // The title states the expectation for a healthy stack. The explicit
    // budget below only keeps the helper timeouts further down reachable
    // (up to 60 s for the pending host, 60 s for "online", 120 s for the
    // backup): under a shorter per-test timeout a slow run would be cut
    // off with a bare test timeout instead of the helpers' own diagnostics.
    test.setTimeout(5 * 60_000);

    const { username, password } = await bootstrapAdmin(request);
    await loginViaUI(page, username, password);

    // Dashboard renders.
    await expect(page.locator('main')).toContainText(/host|fleet|pending/i, { timeout: 10_000 });

    // Pending host appears (the agent container has been
    // announcing since startup).
    const pendingID = await waitForPendingHostID(page);
    const cookie = await getSessionCookie(page);

    // Accept with the rest-server creds. compose's rest-server runs
    // --no-auth, so any credentials work; restic still demands a
    // password to encrypt the repo.
    await acceptPending(request, cookie, pendingID, {
      url: 'rest:http://rest-server:8000/',
      password: 'e2e-repo-password',
    });

    // Wait for the host to come online + auto-init to land.
    const onlineHost = await waitForHostStatus(
      request, cookie,
      (h) => h.status === 'online',
      60_000,
    );
    expect(onlineHost.id).toBeTruthy();

    // Trigger a backup via the UI form-post (HX-Redirect to /jobs/{id}).
    await page.goto(`${baseURL}/hosts/${onlineHost.id}`);
    await Promise.all([
      page.waitForURL(/\/jobs\//),
      page.locator('form[action$="/run-backup"] button[type="submit"]').first().click(),
    ]);

    // Wait for the host's last_backup_status to flip to 'succeeded'.
    // The job page itself is harder to assert on (it uses
    // server-pushed updates and a reload-on-finish pattern); the
    // host record is the source of truth and is what the dashboard
    // surfaces.
    const finishedHost = await waitForHostStatus(
      request, cookie,
      (h) => h.id === onlineHost.id && h.last_backup_status === 'succeeded',
      120_000,
    );
    expect(finishedHost.last_backup_status).toBe('succeeded');
  });
});
|
|
||||||
|
|
||||||
test.describe('smoke: scrape /metrics', () => {
  // /metrics is documented (RM_METRICS_TOKEN / RM_METRICS_TRUSTED_CIDR,
  // gauges rm_hosts_total / rm_build_info) but the server does not
  // implement it yet. The test stays skipped until the Prometheus
  // exposition lands; that work is tracked separately from this
  // e2e harness.
  test.skip('metrics endpoint exposes the host gauge', async ({ request }) => {
    const response = await request.get(`${baseURL}/metrics`);
    expect(response.status()).toBe(200);

    const exposition = await response.text();
    for (const needle of ['rm_hosts_total', 'rm_build_info{']) {
      expect(exposition).toContain(needle);
    }
  });
});
|
|
||||||
@@ -1,100 +0,0 @@
|
|||||||
// Package updater carries the agent's self-update logic.
|
|
||||||
//
|
|
||||||
// The flow is operator-driven: the server dispatches a command.update
|
|
||||||
// WS envelope, the agent fetches a fresh binary from the server's
|
|
||||||
// /agent/binary endpoint, atomic-renames it over the running binary
|
|
||||||
// (Linux) or hands off to a detached helper script (Windows), and
|
|
||||||
// exits cleanly so the service manager restarts under the new
|
|
||||||
// binary. See docs/superpowers/specs/2026-05-06-p6-01-02-...
|
|
||||||
//
|
|
||||||
// Platform-specific code is build-tagged into updater_unix.go /
|
|
||||||
// updater_windows.go. This file holds the shared HTTP fetch + path
|
|
||||||
// helpers + the test seam.
|
|
||||||
package updater
|
|
||||||
|
|
||||||
import (
|
|
||||||
"context"
|
|
||||||
"fmt"
|
|
||||||
"io"
|
|
||||||
"net/http"
|
|
||||||
"os"
|
|
||||||
"path/filepath"
|
|
||||||
"runtime"
|
|
||||||
"time"
|
|
||||||
)
|
|
||||||
|
|
||||||
// fetch downloads the agent binary for the running GOOS/GOARCH from
// serverURL's /agent/binary endpoint and stages it at binaryPath+".new".
//
// The staged file is written, fsynced, closed and chmodded to 0o755
// before fetch returns. If anything fails after the stage file has been
// created, the file is removed again so a failed fetch does not leave a
// partial download behind. On success the returned path is always
// binaryPath + ".new"; on error it is "" and the error is wrapped with
// the step that failed.
func fetch(ctx context.Context, serverURL, binaryPath string) (stagePath string, err error) {
	url := fmt.Sprintf("%s/agent/binary?os=%s&arch=%s", serverURL, runtime.GOOS, runtime.GOARCH)
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
	if err != nil {
		return "", fmt.Errorf("agent binary fetch: build request: %w", err)
	}
	c := &http.Client{Timeout: 5 * time.Minute}
	res, err := c.Do(req)
	if err != nil {
		return "", fmt.Errorf("agent binary fetch: %w", err)
	}
	defer func() { _ = res.Body.Close() }()
	if res.StatusCode != http.StatusOK {
		return "", fmt.Errorf("agent binary fetch: %s", res.Status)
	}

	staged := binaryPath + ".new"
	f, err := os.OpenFile(staged, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0o755)
	if err != nil {
		return "", fmt.Errorf("agent binary fetch: create stage file: %w", err)
	}
	// Single cleanup point for every failure below. Close is best-effort
	// here: the file may already be closed when Chmod is the step that
	// failed, and the error we report is the one that caused the abort.
	defer func() {
		if err != nil {
			_ = f.Close()
			_ = os.Remove(staged)
		}
	}()

	if _, err = io.Copy(f, res.Body); err != nil {
		return "", fmt.Errorf("agent binary fetch: write stage file: %w", err)
	}
	if err = f.Sync(); err != nil {
		return "", fmt.Errorf("agent binary fetch: sync stage file: %w", err)
	}
	if err = f.Close(); err != nil {
		return "", fmt.Errorf("agent binary fetch: close stage file: %w", err)
	}
	// OpenFile's mode is subject to the umask and is ignored when the
	// file already existed, so set the executable bits explicitly.
	if err = os.Chmod(staged, 0o755); err != nil {
		return "", fmt.Errorf("agent binary fetch: chmod stage file: %w", err)
	}
	return staged, nil
}
|
|
||||||
|
|
||||||
// resolveOwnBinary returns the absolute path of the running binary, as
// reported by os.Executable and made absolute with filepath.Abs.
//
// A result of /proc/self/exe is rejected: os.Executable can return that
// on some systems, and it is not a path the updater can rename a new
// binary over.
func resolveOwnBinary() (string, error) {
	exe, err := os.Executable()
	if err != nil {
		return "", fmt.Errorf("resolve own binary: %w", err)
	}
	abs, err := filepath.Abs(exe)
	if err != nil {
		return "", fmt.Errorf("resolve own binary: absolute path of %q: %w", exe, err)
	}
	if abs == "/proc/self/exe" {
		return "", fmt.Errorf("cannot resolve own binary path (/proc/self/exe)")
	}
	return abs, nil
}
|
|
||||||
|
|
||||||
// UpdateForTest is the platform-neutral test seam. In production the
|
|
||||||
// platform-specific Update fetches, swaps, then exits the process.
|
|
||||||
// UpdateForTest stops short of the exit so unit tests can assert on
|
|
||||||
// file state.
|
|
||||||
func UpdateForTest(serverURL, binaryPath string) error {
|
|
||||||
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
|
|
||||||
defer cancel()
|
|
||||||
stage, err := fetch(ctx, serverURL, binaryPath)
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
return swap(stage, binaryPath)
|
|
||||||
}
|
|
||||||
@@ -1,87 +0,0 @@
|
|||||||
//go:build !windows
|
|
||||||
|
|
||||||
package updater
|
|
||||||
|
|
||||||
import (
|
|
||||||
"bytes"
|
|
||||||
"io"
|
|
||||||
"net/http"
|
|
||||||
"net/http/httptest"
|
|
||||||
"os"
|
|
||||||
"path/filepath"
|
|
||||||
"runtime"
|
|
||||||
"testing"
|
|
||||||
)
|
|
||||||
|
|
||||||
// TestUpdate_LinuxAtomicSwap stages a fake "running binary" file, runs
|
|
||||||
// UpdateForTest against a fake /agent/binary server, and asserts that
|
|
||||||
// the binary was swapped, .old preserves the previous bytes, and .new
|
|
||||||
// was renamed away.
|
|
||||||
func TestUpdate_LinuxAtomicSwap(t *testing.T) {
|
|
||||||
tmp := t.TempDir()
|
|
||||||
binPath := filepath.Join(tmp, "agent")
|
|
||||||
if err := os.WriteFile(binPath, []byte("OLD"), 0o755); err != nil {
|
|
||||||
t.Fatal(err)
|
|
||||||
}
|
|
||||||
newBytes := []byte("NEW BINARY CONTENTS")
|
|
||||||
|
|
||||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
|
||||||
if r.URL.Path != "/agent/binary" {
|
|
||||||
http.NotFound(w, r)
|
|
||||||
return
|
|
||||||
}
|
|
||||||
gotOS, gotArch := r.URL.Query().Get("os"), r.URL.Query().Get("arch")
|
|
||||||
if gotOS != runtime.GOOS || gotArch != runtime.GOARCH {
|
|
||||||
t.Errorf("query mismatch: got os=%s arch=%s want %s/%s",
|
|
||||||
gotOS, gotArch, runtime.GOOS, runtime.GOARCH)
|
|
||||||
}
|
|
||||||
_, _ = io.Copy(w, bytes.NewReader(newBytes))
|
|
||||||
}))
|
|
||||||
defer srv.Close()
|
|
||||||
|
|
||||||
if err := UpdateForTest(srv.URL, binPath); err != nil {
|
|
||||||
t.Fatalf("update: %v", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
got, err := os.ReadFile(binPath)
|
|
||||||
if err != nil {
|
|
||||||
t.Fatal(err)
|
|
||||||
}
|
|
||||||
if string(got) != string(newBytes) {
|
|
||||||
t.Fatalf("binary contents: got %q want %q", got, newBytes)
|
|
||||||
}
|
|
||||||
old, err := os.ReadFile(binPath + ".old")
|
|
||||||
if err != nil {
|
|
||||||
t.Fatalf("agent.old missing: %v", err)
|
|
||||||
}
|
|
||||||
if string(old) != "OLD" {
|
|
||||||
t.Fatalf("agent.old contents: got %q want %q", old, "OLD")
|
|
||||||
}
|
|
||||||
if _, err := os.Stat(binPath + ".new"); !os.IsNotExist(err) {
|
|
||||||
t.Fatalf("agent.new should be absent after swap, got err=%v", err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// TestUpdate_FetchHTTPError surfaces the server's status when the
|
|
||||||
// binary is not published for this os/arch.
|
|
||||||
func TestUpdate_FetchHTTPError(t *testing.T) {
|
|
||||||
tmp := t.TempDir()
|
|
||||||
binPath := filepath.Join(tmp, "agent")
|
|
||||||
if err := os.WriteFile(binPath, []byte("OLD"), 0o755); err != nil {
|
|
||||||
t.Fatal(err)
|
|
||||||
}
|
|
||||||
|
|
||||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
|
||||||
http.Error(w, `{"error":"binary_not_published"}`, http.StatusNotFound)
|
|
||||||
}))
|
|
||||||
defer srv.Close()
|
|
||||||
|
|
||||||
err := UpdateForTest(srv.URL, binPath)
|
|
||||||
if err == nil {
|
|
||||||
t.Fatal("expected error, got nil")
|
|
||||||
}
|
|
||||||
got, _ := os.ReadFile(binPath)
|
|
||||||
if string(got) != "OLD" {
|
|
||||||
t.Fatalf("binary should not have changed, got %q", got)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -1,73 +0,0 @@
|
|||||||
//go:build !windows
|
|
||||||
|
|
||||||
package updater
|
|
||||||
|
|
||||||
import (
|
|
||||||
"context"
|
|
||||||
"fmt"
|
|
||||||
"io"
|
|
||||||
"log/slog"
|
|
||||||
"os"
|
|
||||||
"time"
|
|
||||||
)
|
|
||||||
|
|
||||||
// Update fetches the new binary, swaps it in, then exits so systemd
|
|
||||||
// restarts the process under the new binary. The caller should close
|
|
||||||
// the WS connection cleanly (so the server transitions the host to
|
|
||||||
// disconnected immediately rather than waiting for the heartbeat
|
|
||||||
// sweep) before invoking.
|
|
||||||
//
|
|
||||||
// Service-user assumption: the agent runs as root under the
|
|
||||||
// systemd-shipped unit, which can write the binary path directly.
|
|
||||||
// If the agent ever moves to a non-root service user, this breaks —
|
|
||||||
// would need a setuid helper or an out-of-process update service.
|
|
||||||
func Update(ctx context.Context, serverURL string) error {
|
|
||||||
binPath, err := resolveOwnBinary()
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
stage, err := fetch(ctx, serverURL, binPath)
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
if err := swap(stage, binPath); err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
slog.Info("agent self-update: binary swapped, exiting for systemd restart",
|
|
||||||
"binary", binPath)
|
|
||||||
// Give logger / WS close-frame a moment to flush, then exit.
|
|
||||||
time.Sleep(200 * time.Millisecond)
|
|
||||||
os.Exit(0)
|
|
||||||
return nil // unreachable
|
|
||||||
}
|
|
||||||
|
|
||||||
// swap installs the staged binary at stagePath over binPath, keeping
// one revision back for manual rollback.
//
// It first copies the current contents of binPath to binPath + ".old"
// (written, fsynced and closed), then renames stagePath onto binPath.
// The rename is the only step that touches binPath itself, so a failure
// at any earlier point leaves the running binary as it was.
//
// If writing the ".old" copy fails part-way, the partial file is
// removed: a truncated rollback binary is worse than none. Every
// returned error names the step that failed and wraps the cause.
func swap(stagePath, binPath string) error {
	src, err := os.Open(binPath)
	if err != nil {
		return fmt.Errorf("open running binary: %w", err)
	}
	defer func() { _ = src.Close() }()

	oldPath := binPath + ".old"
	dst, err := os.OpenFile(oldPath, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0o755)
	if err != nil {
		return fmt.Errorf("open .old: %w", err)
	}
	// discard abandons a partially written backup. Both calls are
	// best-effort; the error that triggered the cleanup is reported.
	discard := func() {
		_ = dst.Close()
		_ = os.Remove(oldPath)
	}
	if _, err := io.Copy(dst, src); err != nil {
		discard()
		return fmt.Errorf("copy to .old: %w", err)
	}
	if err := dst.Sync(); err != nil {
		discard()
		return fmt.Errorf("sync .old: %w", err)
	}
	if err := dst.Close(); err != nil {
		_ = os.Remove(oldPath)
		return fmt.Errorf("close .old: %w", err)
	}

	if err := os.Rename(stagePath, binPath); err != nil {
		return fmt.Errorf("rename .new over running binary: %w", err)
	}
	return nil
}
|
|
||||||
@@ -1,73 +0,0 @@
|
|||||||
//go:build windows
|
|
||||||
|
|
||||||
package updater
|
|
||||||
|
|
||||||
import (
|
|
||||||
"context"
|
|
||||||
"fmt"
|
|
||||||
"log/slog"
|
|
||||||
"os"
|
|
||||||
"os/exec"
|
|
||||||
"path/filepath"
|
|
||||||
"syscall"
|
|
||||||
"time"
|
|
||||||
)
|
|
||||||
|
|
||||||
// helperScript is the batch-file template for the detached Windows
// update helper. It is rendered with fmt.Sprintf using indexed verbs:
//
//	%[1]s  path of the running binary (source of the backup copy)
//	%[2]s  path of the .old backup
//	%[3]s  path of the staged .new binary
//	%[4]s  path of the running binary (target of the move)
//
// The rendered script waits three seconds, copies the running binary to
// the backup path, stops the restic-manager-agent service and polls
// until it reports STOPPED, moves the staged binary into place, starts
// the service again, and finally deletes itself. The doubled percent in
// the last line renders as a single one, i.e. the script's own path.
const helperScript = `@echo off
timeout /t 3 /nobreak >nul
copy /Y "%[1]s" "%[2]s"
sc stop restic-manager-agent
:wait
sc query restic-manager-agent | find "STOPPED" >nul
if errorlevel 1 (timeout /t 1 /nobreak >nul & goto wait)
move /Y "%[3]s" "%[4]s"
sc start restic-manager-agent
del "%%~f0"
`
|
|
||||||
|
|
||||||
// Update on Windows can't overwrite the running .exe in-process
|
|
||||||
// (exclusive file lock), so we stage the new binary, write a small
|
|
||||||
// detached helper script that waits, stops the service, swaps the
|
|
||||||
// binary, and starts the service, then exit cleanly. SCM treats
|
|
||||||
// clean exits after sc stop as intentional and does not auto-restart;
|
|
||||||
// the helper's final sc start handles that.
|
|
||||||
func Update(ctx context.Context, serverURL string) error {
|
|
||||||
binPath, err := resolveOwnBinary()
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
stage, err := fetch(ctx, serverURL, binPath)
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
helperPath := filepath.Join(filepath.Dir(binPath), "agent-update.cmd")
|
|
||||||
body := fmt.Sprintf(helperScript, binPath, binPath+".old", stage, binPath)
|
|
||||||
if err := os.WriteFile(helperPath, []byte(body), 0o755); err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
cmd := exec.Command("cmd.exe", "/c", helperPath)
|
|
||||||
cmd.SysProcAttr = &syscall.SysProcAttr{
|
|
||||||
HideWindow: true,
|
|
||||||
CreationFlags: 0x00000008 | 0x08000000, // DETACHED_PROCESS | CREATE_NO_WINDOW
|
|
||||||
}
|
|
||||||
if err := cmd.Start(); err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
slog.Info("agent self-update: helper spawned, exiting cleanly",
|
|
||||||
"binary", binPath, "helper", helperPath)
|
|
||||||
time.Sleep(200 * time.Millisecond)
|
|
||||||
os.Exit(0)
|
|
||||||
return nil // unreachable
|
|
||||||
}
|
|
||||||
|
|
||||||
// swap always fails on Windows: the in-place swap is done by the helper
// script that Update launches, not in-process. The function exists so
// that platform-neutral code referring to swap (UpdateForTest) still
// builds here. Both arguments are ignored.
func swap(_, _ string) error {
	unsupported := fmt.Errorf("updater.swap not implemented on Windows; use the helper script via Update")
	return unsupported
}
|
|
||||||
@@ -1,63 +0,0 @@
|
|||||||
package alert
|
|
||||||
|
|
||||||
import (
|
|
||||||
"context"
|
|
||||||
"fmt"
|
|
||||||
"log/slog"
|
|
||||||
"time"
|
|
||||||
|
|
||||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/notification"
|
|
||||||
)
|
|
||||||
|
|
||||||
// Alert kinds used by the agent self-update flows.
const (
	// KindUpdateFailed marks a single host whose agent update did not
	// succeed. Raised by RaiseUpdateFailed and cleared by
	// ResolveUpdateFailed. Per the original notes, the trigger is a
	// timeout or version mismatch after a command.update dispatch, and
	// the resolution is a later hello carrying the expected version.
	KindUpdateFailed = "update_failed"

	// KindFleetUpdateHalted marks a fleet-wide update that stopped
	// part-way. Raised by RaiseFleetUpdateHalted through the system
	// (host-less) alert path. Per the original notes it is resolved
	// manually by an admin.
	KindFleetUpdateHalted = "fleet_update_halted"
)
|
|
||||||
|
|
||||||
// RaiseUpdateFailed records a per-host update failure. dedupKey is the
|
|
||||||
// hostID so a re-dispatch on the same host touches the existing alert
|
|
||||||
// rather than spawning a duplicate.
|
|
||||||
func (e *Engine) RaiseUpdateFailed(ctx context.Context, hostID, jobID, reason string, when time.Time) {
|
|
||||||
msg := fmt.Sprintf("Agent update failed (job %s): %s", jobID, reason)
|
|
||||||
e.raiseAndNotify(ctx, hostID, KindUpdateFailed, hostID, "warning", msg, when)
|
|
||||||
}
|
|
||||||
|
|
||||||
// ResolveUpdateFailed clears any open update_failed alert for hostID.
|
|
||||||
// Called from the WS hello path when the agent reconnects with the
|
|
||||||
// target version.
|
|
||||||
func (e *Engine) ResolveUpdateFailed(ctx context.Context, hostID string, when time.Time) {
|
|
||||||
e.resolveAndNotify(ctx, hostID, KindUpdateFailed, hostID, when)
|
|
||||||
}
|
|
||||||
|
|
||||||
// RaiseFleetUpdateHalted is host-less — the fleet update is a
|
|
||||||
// system-level concept. We persist it via the dedicated host-less
|
|
||||||
// alert path so the alerts table's host_id column carries NULL.
|
|
||||||
func (e *Engine) RaiseFleetUpdateHalted(ctx context.Context, fleetUpdateID, reason string, when time.Time) {
|
|
||||||
msg := fmt.Sprintf("Fleet update %s halted: %s", fleetUpdateID, reason)
|
|
||||||
id, didRaise, err := e.store.RaiseOrTouchSystem(ctx, KindFleetUpdateHalted, fleetUpdateID, "warning", msg, when)
|
|
||||||
if err != nil {
|
|
||||||
slog.Warn("alert: raise fleet_update_halted", "fu_id", fleetUpdateID, "err", err)
|
|
||||||
return
|
|
||||||
}
|
|
||||||
if !didRaise {
|
|
||||||
return
|
|
||||||
}
|
|
||||||
go e.hub.Dispatch(ctx, notification.Payload{
|
|
||||||
Event: notification.EventRaised,
|
|
||||||
AlertID: id,
|
|
||||||
Severity: "warning",
|
|
||||||
Kind: KindFleetUpdateHalted,
|
|
||||||
HostID: "",
|
|
||||||
HostName: "",
|
|
||||||
Message: msg,
|
|
||||||
RaisedAt: when,
|
|
||||||
})
|
|
||||||
}
|
|
||||||
@@ -63,7 +63,6 @@ const (
|
|||||||
JobUnlock JobKind = "unlock"
|
JobUnlock JobKind = "unlock"
|
||||||
JobRestore JobKind = "restore"
|
JobRestore JobKind = "restore"
|
||||||
JobDiff JobKind = "diff"
|
JobDiff JobKind = "diff"
|
||||||
JobUpdate JobKind = "update"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
// JobStatus is the lifecycle state of a job.
|
// JobStatus is the lifecycle state of a job.
|
||||||
@@ -362,14 +361,13 @@ type ConfigUpdatePayload struct {
|
|||||||
BandwidthDownKBps *int `json:"bandwidth_down_kbps,omitempty"`
|
BandwidthDownKBps *int `json:"bandwidth_down_kbps,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
// CommandUpdatePayload carries no operational data — the agent
|
// AgentUpdateAvailablePayload — informational only; the agent does
|
||||||
// already knows its own os/arch and fetches from its configured
|
// NOT self-update. See spec.md §4.2 for the package-manager-based
|
||||||
// server URL via /agent/binary. JobID is the server-issued id of
|
// update model.
|
||||||
// the update job; the agent echoes it on log.stream lines so the
|
type AgentUpdateAvailablePayload struct {
|
||||||
// live job log captures pre-restart progress, then either exits
|
LatestVersion string `json:"latest_version"`
|
||||||
// (Linux) or hands off to a detached helper script (Windows).
|
PackageURL string `json:"package_url"` // apt repo / choco source
|
||||||
type CommandUpdatePayload struct {
|
Changelog string `json:"changelog,omitempty"`
|
||||||
JobID string `json:"job_id"`
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// TreeListRequestPayload is the body of a tree.list RPC. Used by the
|
// TreeListRequestPayload is the body of a tree.list RPC. Used by the
|
||||||
|
|||||||
@@ -29,12 +29,12 @@ const (
|
|||||||
|
|
||||||
// Server → agent message types.
|
// Server → agent message types.
|
||||||
const (
|
const (
|
||||||
MsgCommandRun MessageType = "command.run"
|
MsgCommandRun MessageType = "command.run"
|
||||||
MsgCommandCancel MessageType = "command.cancel"
|
MsgCommandCancel MessageType = "command.cancel"
|
||||||
MsgScheduleSet MessageType = "schedule.set"
|
MsgScheduleSet MessageType = "schedule.set"
|
||||||
MsgConfigUpdate MessageType = "config.update"
|
MsgConfigUpdate MessageType = "config.update"
|
||||||
MsgCommandUpdate MessageType = "command.update"
|
MsgAgentUpdateAvail MessageType = "agent.update.available"
|
||||||
MsgTreeList MessageType = "tree.list" // sync RPC: list a snapshot's children
|
MsgTreeList MessageType = "tree.list" // sync RPC: list a snapshot's children
|
||||||
)
|
)
|
||||||
|
|
||||||
// Envelope is the framing for every WS message in either direction.
|
// Envelope is the framing for every WS message in either direction.
|
||||||
|
|||||||
@@ -1,221 +0,0 @@
|
|||||||
// Package fleetupdate drives a rolling, sequential agent self-update
|
|
||||||
// over a list of hosts. One worker goroutine per Start() call (gated
|
|
||||||
// at the store layer to at-most-one-running-fleet-update).
|
|
||||||
package fleetupdate
|
|
||||||
|
|
||||||
import (
|
|
||||||
"context"
|
|
||||||
"errors"
|
|
||||||
"fmt"
|
|
||||||
"log/slog"
|
|
||||||
"time"
|
|
||||||
|
|
||||||
"github.com/oklog/ulid/v2"
|
|
||||||
|
|
||||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
|
|
||||||
)
|
|
||||||
|
|
||||||
// Hub is the one question the fleet-update worker asks about agent
// connectivity.
type Hub interface {
	// Connected reports whether the host identified by hostID is
	// currently connected.
	Connected(hostID string) bool
}
|
|
||||||
|
|
||||||
// Dispatcher starts an agent update on a single host.
//
// The worker treats a non-nil err or a non-empty code as a failed
// dispatch and records whatever the dispatcher returned. Per the
// original notes, implementations also own the surrounding bookkeeping
// (job row, audit entry, update-watcher registration) and any
// pre-checks.
type Dispatcher interface {
	// DispatchUpdate asks hostID to update on behalf of actorUserID.
	// It returns the ID of the job it created, plus either an error
	// code string or an error when the dispatch was rejected.
	DispatchUpdate(ctx context.Context, hostID, actorUserID string) (jobID, code string, err error)
}
|
|
||||||
|
|
||||||
// AlertRaiser is the single alerting hook the fleet-update worker
// needs: a way to announce that a fleet update has been halted.
type AlertRaiser interface {
	// RaiseFleetUpdateHalted reports that fleetUpdateID stopped at time
	// when for the given reason.
	RaiseFleetUpdateHalted(ctx context.Context, fleetUpdateID, reason string, when time.Time)
}
|
|
||||||
|
|
||||||
// Worker is the long-lived fleet-update orchestrator. There is at most
|
|
||||||
// one *running* fleet update at a time (enforced by the store).
|
|
||||||
type Worker struct {
|
|
||||||
store *store.Store
|
|
||||||
hub Hub
|
|
||||||
disp Dispatcher
|
|
||||||
alerts AlertRaiser
|
|
||||||
|
|
||||||
// targetVersion is the version every dispatched agent is expected
|
|
||||||
// to come back with. Captured at Start time to avoid drift.
|
|
||||||
targetVersion string
|
|
||||||
|
|
||||||
// pollPeriod controls the cadence at which the worker re-reads the
|
|
||||||
// host row to check for the version transition. Exposed for tests.
|
|
||||||
pollPeriod time.Duration
|
|
||||||
// hostTimeout bounds how long the worker waits for one host to
|
|
||||||
// reach the target version before halting.
|
|
||||||
hostTimeout time.Duration
|
|
||||||
}
|
|
||||||
|
|
||||||
// NewWorker builds an unstarted worker. targetVersion is set on each
|
|
||||||
// Start call; the values here are defaults.
|
|
||||||
func NewWorker(st *store.Store, hub Hub, disp Dispatcher, alerts AlertRaiser) *Worker {
|
|
||||||
return &Worker{
|
|
||||||
store: st,
|
|
||||||
hub: hub,
|
|
||||||
disp: disp,
|
|
||||||
alerts: alerts,
|
|
||||||
pollPeriod: 1 * time.Second,
|
|
||||||
hostTimeout: 95 * time.Second,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Start creates the parent + child rows, then spawns the per-host
|
|
||||||
// worker goroutine. Returns the new fleet_update_id on success.
|
|
||||||
// store.ErrFleetUpdateRunning bubbles up unchanged.
|
|
||||||
func (w *Worker) Start(ctx context.Context, userID, targetVersion string, hostIDs []string) (string, error) {
|
|
||||||
if userID == "" || targetVersion == "" {
|
|
||||||
return "", errors.New("fleetupdate: userID and targetVersion required")
|
|
||||||
}
|
|
||||||
if len(hostIDs) == 0 {
|
|
||||||
return "", errors.New("fleetupdate: at least one host required")
|
|
||||||
}
|
|
||||||
fuID := ulid.Make().String()
|
|
||||||
now := time.Now().UTC()
|
|
||||||
if err := w.store.CreateFleetUpdate(ctx, store.FleetUpdate{
|
|
||||||
ID: fuID,
|
|
||||||
StartedAt: now,
|
|
||||||
StartedByUserID: userID,
|
|
||||||
TargetVersion: targetVersion,
|
|
||||||
Status: "running",
|
|
||||||
}, hostIDs); err != nil {
|
|
||||||
return "", err
|
|
||||||
}
|
|
||||||
|
|
||||||
// The goroutine outlives the request that started it; carry a
|
|
||||||
// detached context so an HTTP-handler ctx cancel doesn't abort
|
|
||||||
// the long roll.
|
|
||||||
bg := context.WithoutCancel(ctx)
|
|
||||||
go w.run(bg, fuID, userID, targetVersion)
|
|
||||||
return fuID, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// Cancel marks the fleet update cancelled. The running goroutine
|
|
||||||
// observes the new status on its next pre-check and exits without
|
|
||||||
// dispatching further hosts. The currently-dispatched job is left to
|
|
||||||
// finish on its own — cancelling agent-side is out of scope for v1.
|
|
||||||
func (w *Worker) Cancel(ctx context.Context, fuID string) error {
|
|
||||||
return w.store.CancelFleetUpdate(ctx, fuID, time.Now().UTC())
|
|
||||||
}
|
|
||||||
|
|
||||||
// run is the per-host loop. Halts on first failure; emits one alert
|
|
||||||
// on transition.
|
|
||||||
func (w *Worker) run(ctx context.Context, fuID, userID, targetVersion string) {
|
|
||||||
w.targetVersion = targetVersion
|
|
||||||
|
|
||||||
for {
|
|
||||||
// Check the parent row's status — picks up Cancel.
|
|
||||||
fu, err := w.store.ActiveFleetUpdate(ctx)
|
|
||||||
if err != nil {
|
|
||||||
slog.Warn("fleetupdate: read active", "fu_id", fuID, "err", err)
|
|
||||||
return
|
|
||||||
}
|
|
||||||
if fu == nil || fu.ID != fuID {
|
|
||||||
// Cancelled, halted, or completed externally. Done.
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
pending, err := w.store.ListPendingFleetUpdateHosts(ctx, fuID)
|
|
||||||
if err != nil {
|
|
||||||
slog.Warn("fleetupdate: list pending", "fu_id", fuID, "err", err)
|
|
||||||
return
|
|
||||||
}
|
|
||||||
if len(pending) == 0 {
|
|
||||||
now := time.Now().UTC()
|
|
||||||
if err := w.store.CompleteFleetUpdate(ctx, fuID, now); err != nil {
|
|
||||||
slog.Warn("fleetupdate: complete", "fu_id", fuID, "err", err)
|
|
||||||
}
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
next := pending[0]
|
|
||||||
w.processHost(ctx, fuID, userID, next)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// processHost handles one host slot. Marks it skipped, succeeded, or
|
|
||||||
// failed (and halts the fleet on failure).
|
|
||||||
func (w *Worker) processHost(ctx context.Context, fuID, userID string, slot store.FleetUpdateHost) {
|
|
||||||
hostID := slot.HostID
|
|
||||||
_ = w.store.SetFleetUpdateCurrentHost(ctx, fuID, hostID)
|
|
||||||
|
|
||||||
// Pre-flight: re-read the host. The dispatch path repeats most of
|
|
||||||
// these checks but doing them up-front lets us emit the right
|
|
||||||
// per-host status (skipped vs failed) without consuming a job row.
|
|
||||||
host, err := w.store.GetHost(ctx, hostID)
|
|
||||||
if err != nil || host == nil {
|
|
||||||
_ = w.store.SetFleetUpdateHostStatus(ctx, fuID, hostID, "skipped", "host not found", "")
|
|
||||||
return
|
|
||||||
}
|
|
||||||
if host.AgentVersion != "" && host.AgentVersion == w.targetVersion {
|
|
||||||
_ = w.store.SetFleetUpdateHostStatus(ctx, fuID, hostID, "skipped", "already at target version", "")
|
|
||||||
return
|
|
||||||
}
|
|
||||||
if !w.hub.Connected(hostID) {
|
|
||||||
reason := fmt.Sprintf("host went offline: %s", hostID)
|
|
||||||
_ = w.store.SetFleetUpdateHostStatus(ctx, fuID, hostID, "failed", reason, "")
|
|
||||||
w.halt(ctx, fuID, reason)
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
// Dispatch.
|
|
||||||
_ = w.store.SetFleetUpdateHostStatus(ctx, fuID, hostID, "running", "", "")
|
|
||||||
jobID, code, err := w.disp.DispatchUpdate(ctx, hostID, userID)
|
|
||||||
if err != nil || code != "" {
|
|
||||||
reason := dispatchErrorReason(code, err)
|
|
||||||
_ = w.store.SetFleetUpdateHostStatus(ctx, fuID, hostID, "failed", reason, jobID)
|
|
||||||
w.halt(ctx, fuID, reason)
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
// Poll until the host's recorded agent_version matches target, or
|
|
||||||
// timeout.
|
|
||||||
deadline := time.Now().Add(w.hostTimeout)
|
|
||||||
for time.Now().Before(deadline) {
|
|
||||||
// Honour cancellation between polls.
|
|
||||||
fu, err := w.store.ActiveFleetUpdate(ctx)
|
|
||||||
if err == nil && (fu == nil || fu.ID != fuID) {
|
|
||||||
// Cancelled mid-host; leave the slot in 'running' for the
|
|
||||||
// admin to inspect. No further dispatches.
|
|
||||||
return
|
|
||||||
}
|
|
||||||
time.Sleep(w.pollPeriod)
|
|
||||||
h, err := w.store.GetHost(ctx, hostID)
|
|
||||||
if err == nil && h != nil && h.AgentVersion == w.targetVersion {
|
|
||||||
if err := w.store.SetFleetUpdateHostStatus(ctx, fuID, hostID, "succeeded", "", jobID); err != nil {
|
|
||||||
slog.Warn("fleetupdate: set succeeded", "fu_id", fuID, "host_id", hostID, "err", err)
|
|
||||||
}
|
|
||||||
return
|
|
||||||
}
|
|
||||||
}
|
|
||||||
reason := fmt.Sprintf("timeout waiting for %s to reach %s", hostID, w.targetVersion)
|
|
||||||
_ = w.store.SetFleetUpdateHostStatus(ctx, fuID, hostID, "failed", reason, jobID)
|
|
||||||
w.halt(ctx, fuID, reason)
|
|
||||||
}
|
|
||||||
|
|
||||||
func (w *Worker) halt(ctx context.Context, fuID, reason string) {
|
|
||||||
now := time.Now().UTC()
|
|
||||||
if err := w.store.HaltFleetUpdate(ctx, fuID, reason, now); err != nil {
|
|
||||||
slog.Warn("fleetupdate: halt", "fu_id", fuID, "err", err)
|
|
||||||
}
|
|
||||||
if w.alerts != nil {
|
|
||||||
w.alerts.RaiseFleetUpdateHalted(ctx, fuID, reason, now)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// dispatchErrorReason turns a failed dispatch into the human-readable
// reason stored on the host slot. A non-empty code wins and is
// prefixed with "dispatch failed: "; otherwise the error's own text is
// used; with neither, the generic "dispatch failed" is returned.
func dispatchErrorReason(code string, err error) string {
	switch {
	case code != "":
		return "dispatch failed: " + code
	case err != nil:
		return err.Error()
	default:
		return "dispatch failed"
	}
}
|
|
||||||
@@ -1,344 +0,0 @@
|
|||||||
package fleetupdate
|
|
||||||
|
|
||||||
import (
|
|
||||||
"context"
|
|
||||||
"errors"
|
|
||||||
"path/filepath"
|
|
||||||
"sync"
|
|
||||||
"testing"
|
|
||||||
"time"
|
|
||||||
|
|
||||||
"github.com/oklog/ulid/v2"
|
|
||||||
|
|
||||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/api"
|
|
||||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
|
|
||||||
)
|
|
||||||
|
|
||||||
// fakeHub is a test Hub backed by a fixed map of host ID to online
// state, guarded by a mutex.
type fakeHub struct {
	mu     sync.Mutex
	online map[string]bool
}

// Connected reports the recorded online state for hostID; hosts that
// are not in the map count as offline.
func (f *fakeHub) Connected(hostID string) bool {
	f.mu.Lock()
	up := f.online[hostID]
	f.mu.Unlock()
	return up
}
|
|
||||||
|
|
||||||
type fakeDispatcher struct {
|
|
||||||
mu sync.Mutex
|
|
||||||
calls []string // host IDs
|
|
||||||
// after dispatch, set the host's agent_version to this on the
|
|
||||||
// store so the worker observes the version transition.
|
|
||||||
st *store.Store
|
|
||||||
target string
|
|
||||||
delayMS int
|
|
||||||
failOnHost map[string]string // host → error code
|
|
||||||
}
|
|
||||||
|
|
||||||
func (f *fakeDispatcher) DispatchUpdate(ctx context.Context, hostID, _ string) (string, string, error) {
|
|
||||||
f.mu.Lock()
|
|
||||||
f.calls = append(f.calls, hostID)
|
|
||||||
if code, ok := f.failOnHost[hostID]; ok {
|
|
||||||
f.mu.Unlock()
|
|
||||||
return "", code, nil
|
|
||||||
}
|
|
||||||
st := f.st
|
|
||||||
target := f.target
|
|
||||||
delay := f.delayMS
|
|
||||||
f.mu.Unlock()
|
|
||||||
|
|
||||||
jobID := ulid.Make().String()
|
|
||||||
if st != nil {
|
|
||||||
_ = st.CreateJob(context.Background(), store.Job{
|
|
||||||
ID: jobID, HostID: hostID, Kind: "update",
|
|
||||||
ActorKind: "user", CreatedAt: time.Now().UTC(),
|
|
||||||
})
|
|
||||||
}
|
|
||||||
if st != nil && target != "" {
|
|
||||||
go func() {
|
|
||||||
if delay > 0 {
|
|
||||||
time.Sleep(time.Duration(delay) * time.Millisecond)
|
|
||||||
}
|
|
||||||
_ = st.MarkHostHello(context.Background(), hostID, target, "0.17", api.CurrentProtocolVersion, time.Now().UTC())
|
|
||||||
}()
|
|
||||||
}
|
|
||||||
return jobID, "", nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// recAlert is a test AlertRaiser that records the reason of every
// halt alert it receives, in order.
type recAlert struct {
	mu      sync.Mutex
	reasons []string
}

// RaiseFleetUpdateHalted appends reason to the recorded list; the
// context, fleet update ID and timestamp are ignored.
func (r *recAlert) RaiseFleetUpdateHalted(_ context.Context, _ string, reason string, _ time.Time) {
	r.mu.Lock()
	defer r.mu.Unlock()
	r.reasons = append(r.reasons, reason)
}
|
|
||||||
|
|
||||||
func openStore(t *testing.T) *store.Store {
|
|
||||||
t.Helper()
|
|
||||||
dir := t.TempDir()
|
|
||||||
st, err := store.Open(context.Background(), filepath.Join(dir, "rm.db"))
|
|
||||||
if err != nil {
|
|
||||||
t.Fatalf("open: %v", err)
|
|
||||||
}
|
|
||||||
t.Cleanup(func() { _ = st.Close() })
|
|
||||||
return st
|
|
||||||
}
|
|
||||||
|
|
||||||
func mustCreateAdmin(t *testing.T, st *store.Store) string {
|
|
||||||
t.Helper()
|
|
||||||
uid := ulid.Make().String()
|
|
||||||
if err := st.CreateUser(context.Background(), store.User{
|
|
||||||
ID: uid, Username: "u-" + uid[:6],
|
|
||||||
PasswordHash: "x", Role: store.RoleAdmin, CreatedAt: time.Now().UTC(),
|
|
||||||
}); err != nil {
|
|
||||||
t.Fatalf("user: %v", err)
|
|
||||||
}
|
|
||||||
return uid
|
|
||||||
}
|
|
||||||
|
|
||||||
func mustCreateHost(t *testing.T, st *store.Store, name, version string) string {
|
|
||||||
t.Helper()
|
|
||||||
hostID := ulid.Make().String()
|
|
||||||
if err := st.CreateHost(context.Background(), store.Host{
|
|
||||||
ID: hostID, Name: name, OS: "linux", Arch: "amd64",
|
|
||||||
EnrolledAt: time.Now().UTC(),
|
|
||||||
}, "deadbeef-"+hostID, ""); err != nil {
|
|
||||||
t.Fatalf("host: %v", err)
|
|
||||||
}
|
|
||||||
if version != "" {
|
|
||||||
if err := st.MarkHostHello(context.Background(), hostID, version, "0.17", api.CurrentProtocolVersion, time.Now().UTC()); err != nil {
|
|
||||||
t.Fatalf("hello: %v", err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return hostID
|
|
||||||
}
|
|
||||||
|
|
||||||
func waitForStatus(t *testing.T, st *store.Store, fuID, want string, timeout time.Duration) *store.FleetUpdate {
|
|
||||||
t.Helper()
|
|
||||||
deadline := time.Now().Add(timeout)
|
|
||||||
for time.Now().Before(deadline) {
|
|
||||||
fu, _, err := st.GetFleetUpdate(context.Background(), fuID)
|
|
||||||
if err == nil && fu != nil && fu.Status == want {
|
|
||||||
return fu
|
|
||||||
}
|
|
||||||
time.Sleep(20 * time.Millisecond)
|
|
||||||
}
|
|
||||||
t.Fatalf("status never reached %q", want)
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestWorkerTwoHostsBothSucceed(t *testing.T) {
|
|
||||||
st := openStore(t)
|
|
||||||
uid := mustCreateAdmin(t, st)
|
|
||||||
h1 := mustCreateHost(t, st, "h1", "v0")
|
|
||||||
h2 := mustCreateHost(t, st, "h2", "v0")
|
|
||||||
|
|
||||||
hub := &fakeHub{online: map[string]bool{h1: true, h2: true}}
|
|
||||||
disp := &fakeDispatcher{st: st, target: "v2", delayMS: 30}
|
|
||||||
alerts := &recAlert{}
|
|
||||||
w := NewWorker(st, hub, disp, alerts)
|
|
||||||
w.pollPeriod = 20 * time.Millisecond
|
|
||||||
w.hostTimeout = 2 * time.Second
|
|
||||||
|
|
||||||
fuID, err := w.Start(context.Background(), uid, "v2", []string{h1, h2})
|
|
||||||
if err != nil {
|
|
||||||
t.Fatalf("start: %v", err)
|
|
||||||
}
|
|
||||||
waitForStatus(t, st, fuID, "completed", 5*time.Second)
|
|
||||||
_, hosts, _ := st.GetFleetUpdate(context.Background(), fuID)
|
|
||||||
for _, h := range hosts {
|
|
||||||
if h.Status != "succeeded" {
|
|
||||||
t.Errorf("host %s status %q want succeeded", h.HostID, h.Status)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if n := len(alerts.reasons); n != 0 {
|
|
||||||
t.Errorf("unexpected halt alert: %v", alerts.reasons)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestWorkerSecondHostTimesOutHalts(t *testing.T) {
|
|
||||||
st := openStore(t)
|
|
||||||
uid := mustCreateAdmin(t, st)
|
|
||||||
h1 := mustCreateHost(t, st, "h1", "v0")
|
|
||||||
h2 := mustCreateHost(t, st, "h2", "v0")
|
|
||||||
h3 := mustCreateHost(t, st, "h3", "v0")
|
|
||||||
|
|
||||||
hub := &fakeHub{online: map[string]bool{h1: true, h2: true, h3: true}}
|
|
||||||
// h1 dispatches normally (transitions to v2). h2 dispatch returns
|
|
||||||
// success but never transitions.
|
|
||||||
disp := &fakeDispatcher{st: st, target: "v2", delayMS: 20, failOnHost: map[string]string{
|
|
||||||
h2: "", // not a code-failure; simulate by clearing target on this disp run
|
|
||||||
}}
|
|
||||||
// Actually: drop h2 from the auto-transition by faking with a
|
|
||||||
// per-host store setter. Easiest: subclass via a wrapper.
|
|
||||||
_ = disp
|
|
||||||
customDisp := &perHostDispatcher{base: disp, st: st, target: "v2", noTransition: map[string]bool{h2: true}}
|
|
||||||
|
|
||||||
alerts := &recAlert{}
|
|
||||||
w := NewWorker(st, hub, customDisp, alerts)
|
|
||||||
w.pollPeriod = 20 * time.Millisecond
|
|
||||||
w.hostTimeout = 200 * time.Millisecond
|
|
||||||
|
|
||||||
fuID, err := w.Start(context.Background(), uid, "v2", []string{h1, h2, h3})
|
|
||||||
if err != nil {
|
|
||||||
t.Fatalf("start: %v", err)
|
|
||||||
}
|
|
||||||
waitForStatus(t, st, fuID, "halted", 3*time.Second)
|
|
||||||
_, hosts, _ := st.GetFleetUpdate(context.Background(), fuID)
|
|
||||||
gotStatus := map[string]string{}
|
|
||||||
for _, h := range hosts {
|
|
||||||
gotStatus[h.HostID] = h.Status
|
|
||||||
}
|
|
||||||
if gotStatus[h1] != "succeeded" {
|
|
||||||
t.Errorf("h1: %q", gotStatus[h1])
|
|
||||||
}
|
|
||||||
if gotStatus[h2] != "failed" {
|
|
||||||
t.Errorf("h2: %q", gotStatus[h2])
|
|
||||||
}
|
|
||||||
if gotStatus[h3] != "pending" {
|
|
||||||
t.Errorf("h3: %q", gotStatus[h3])
|
|
||||||
}
|
|
||||||
alerts.mu.Lock()
|
|
||||||
defer alerts.mu.Unlock()
|
|
||||||
if len(alerts.reasons) != 1 {
|
|
||||||
t.Errorf("alert reasons: %v", alerts.reasons)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// perHostDispatcher lets a test omit the auto-transition for selected
|
|
||||||
// hosts so we can simulate timeout.
|
|
||||||
type perHostDispatcher struct {
|
|
||||||
mu sync.Mutex
|
|
||||||
base *fakeDispatcher
|
|
||||||
st *store.Store
|
|
||||||
target string
|
|
||||||
noTransition map[string]bool
|
|
||||||
}
|
|
||||||
|
|
||||||
func (p *perHostDispatcher) DispatchUpdate(_ context.Context, hostID, _ string) (string, string, error) {
|
|
||||||
p.mu.Lock()
|
|
||||||
skip := p.noTransition[hostID]
|
|
||||||
p.mu.Unlock()
|
|
||||||
jobID := ulid.Make().String()
|
|
||||||
_ = p.st.CreateJob(context.Background(), store.Job{
|
|
||||||
ID: jobID, HostID: hostID, Kind: "update",
|
|
||||||
ActorKind: "user", CreatedAt: time.Now().UTC(),
|
|
||||||
})
|
|
||||||
if !skip {
|
|
||||||
go func() {
|
|
||||||
time.Sleep(20 * time.Millisecond)
|
|
||||||
_ = p.st.MarkHostHello(context.Background(), hostID, p.target, "0.17", api.CurrentProtocolVersion, time.Now().UTC())
|
|
||||||
}()
|
|
||||||
}
|
|
||||||
return jobID, "", nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestWorkerHostOfflineHalts(t *testing.T) {
|
|
||||||
st := openStore(t)
|
|
||||||
uid := mustCreateAdmin(t, st)
|
|
||||||
h1 := mustCreateHost(t, st, "h1", "v0")
|
|
||||||
h2 := mustCreateHost(t, st, "h2", "v0")
|
|
||||||
hub := &fakeHub{online: map[string]bool{h1: false, h2: true}}
|
|
||||||
disp := &fakeDispatcher{st: st, target: "v2"}
|
|
||||||
alerts := &recAlert{}
|
|
||||||
w := NewWorker(st, hub, disp, alerts)
|
|
||||||
w.pollPeriod = 20 * time.Millisecond
|
|
||||||
w.hostTimeout = 500 * time.Millisecond
|
|
||||||
|
|
||||||
fuID, err := w.Start(context.Background(), uid, "v2", []string{h1, h2})
|
|
||||||
if err != nil {
|
|
||||||
t.Fatalf("start: %v", err)
|
|
||||||
}
|
|
||||||
waitForStatus(t, st, fuID, "halted", 2*time.Second)
|
|
||||||
_, hosts, _ := st.GetFleetUpdate(context.Background(), fuID)
|
|
||||||
if hosts[0].Status != "failed" {
|
|
||||||
t.Errorf("h1 status: %q", hosts[0].Status)
|
|
||||||
}
|
|
||||||
if hosts[1].Status != "pending" {
|
|
||||||
t.Errorf("h2 status: %q", hosts[1].Status)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestWorkerAlreadyAtTargetSkipped(t *testing.T) {
|
|
||||||
st := openStore(t)
|
|
||||||
uid := mustCreateAdmin(t, st)
|
|
||||||
h1 := mustCreateHost(t, st, "h1", "v2")
|
|
||||||
h2 := mustCreateHost(t, st, "h2", "v0")
|
|
||||||
hub := &fakeHub{online: map[string]bool{h1: true, h2: true}}
|
|
||||||
disp := &fakeDispatcher{st: st, target: "v2", delayMS: 20}
|
|
||||||
alerts := &recAlert{}
|
|
||||||
w := NewWorker(st, hub, disp, alerts)
|
|
||||||
w.pollPeriod = 20 * time.Millisecond
|
|
||||||
w.hostTimeout = 2 * time.Second
|
|
||||||
|
|
||||||
fuID, err := w.Start(context.Background(), uid, "v2", []string{h1, h2})
|
|
||||||
if err != nil {
|
|
||||||
t.Fatalf("start: %v", err)
|
|
||||||
}
|
|
||||||
waitForStatus(t, st, fuID, "completed", 4*time.Second)
|
|
||||||
_, hosts, _ := st.GetFleetUpdate(context.Background(), fuID)
|
|
||||||
want := map[string]string{h1: "skipped", h2: "succeeded"}
|
|
||||||
for _, h := range hosts {
|
|
||||||
if h.Status != want[h.HostID] {
|
|
||||||
t.Errorf("host %s: got %q want %q", h.HostID, h.Status, want[h.HostID])
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestWorkerCancelMidRun(t *testing.T) {
|
|
||||||
st := openStore(t)
|
|
||||||
uid := mustCreateAdmin(t, st)
|
|
||||||
h1 := mustCreateHost(t, st, "h1", "v0")
|
|
||||||
h2 := mustCreateHost(t, st, "h2", "v0")
|
|
||||||
hub := &fakeHub{online: map[string]bool{h1: true, h2: true}}
|
|
||||||
// h1's transition is delayed long enough that we can cancel
|
|
||||||
// before it lands; h2 should never be touched.
|
|
||||||
disp := &fakeDispatcher{st: st, target: "v2", delayMS: 500}
|
|
||||||
alerts := &recAlert{}
|
|
||||||
w := NewWorker(st, hub, disp, alerts)
|
|
||||||
w.pollPeriod = 50 * time.Millisecond
|
|
||||||
w.hostTimeout = 5 * time.Second
|
|
||||||
|
|
||||||
fuID, err := w.Start(context.Background(), uid, "v2", []string{h1, h2})
|
|
||||||
if err != nil {
|
|
||||||
t.Fatalf("start: %v", err)
|
|
||||||
}
|
|
||||||
// Give the worker a moment to dispatch h1.
|
|
||||||
time.Sleep(100 * time.Millisecond)
|
|
||||||
if err := w.Cancel(context.Background(), fuID); err != nil {
|
|
||||||
t.Fatalf("cancel: %v", err)
|
|
||||||
}
|
|
||||||
waitForStatus(t, st, fuID, "cancelled", 2*time.Second)
|
|
||||||
|
|
||||||
// h2 should never be dispatched.
|
|
||||||
disp.mu.Lock()
|
|
||||||
defer disp.mu.Unlock()
|
|
||||||
for _, c := range disp.calls {
|
|
||||||
if c == h2 {
|
|
||||||
t.Errorf("h2 dispatched after cancel")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestWorkerStartWhileActiveErrors(t *testing.T) {
|
|
||||||
st := openStore(t)
|
|
||||||
uid := mustCreateAdmin(t, st)
|
|
||||||
h1 := mustCreateHost(t, st, "h1", "v0")
|
|
||||||
h2 := mustCreateHost(t, st, "h2", "v0")
|
|
||||||
hub := &fakeHub{online: map[string]bool{h1: true, h2: true}}
|
|
||||||
disp := &fakeDispatcher{st: st, target: "v2", delayMS: 5_000}
|
|
||||||
w := NewWorker(st, hub, disp, &recAlert{})
|
|
||||||
w.pollPeriod = 50 * time.Millisecond
|
|
||||||
w.hostTimeout = 2 * time.Second
|
|
||||||
if _, err := w.Start(context.Background(), uid, "v2", []string{h1}); err != nil {
|
|
||||||
t.Fatalf("first start: %v", err)
|
|
||||||
}
|
|
||||||
_, err := w.Start(context.Background(), uid, "v2", []string{h2})
|
|
||||||
if !errors.Is(err, store.ErrFleetUpdateRunning) {
|
|
||||||
t.Fatalf("err: %v want ErrFleetUpdateRunning", err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -11,7 +11,6 @@ import (
|
|||||||
"time"
|
"time"
|
||||||
|
|
||||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
|
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
|
||||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/version"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
func makeFilterHosts() []store.Host {
|
func makeFilterHosts() []store.Host {
|
||||||
@@ -99,23 +98,6 @@ func TestSortDashboardHostsColumns(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// TestFilterAndSortDashboardUpdatesBehind: ?updates=behind narrows
|
|
||||||
// to hosts whose agent_version is non-empty AND != server's version.
|
|
||||||
func TestFilterAndSortDashboardUpdatesBehind(t *testing.T) {
|
|
||||||
t.Parallel()
|
|
||||||
hosts := []store.Host{
|
|
||||||
{ID: "01a", Name: "alpha", AgentVersion: "v0.0.1", Status: "online"},
|
|
||||||
{ID: "01b", Name: "bravo", AgentVersion: version.Version, Status: "online"},
|
|
||||||
{ID: "01c", Name: "charlie", AgentVersion: "", Status: "online"}, // never seen
|
|
||||||
{ID: "01d", Name: "delta", AgentVersion: "v0.0.1", Status: "offline"},
|
|
||||||
}
|
|
||||||
got := filterAndSortDashboardHosts(hosts, dashboardFilter{Updates: "behind", Sort: "name", Dir: "asc"})
|
|
||||||
// alpha + delta both behind; bravo (current) and charlie (empty) excluded.
|
|
||||||
if len(got) != 2 || got[0].Name != "alpha" || got[1].Name != "delta" {
|
|
||||||
t.Errorf("updates=behind: got %v", namesOf(got))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// TestParseDashboardFilterDefaults: empty query gives sort=name asc.
|
// TestParseDashboardFilterDefaults: empty query gives sort=name asc.
|
||||||
func TestParseDashboardFilterDefaults(t *testing.T) {
|
func TestParseDashboardFilterDefaults(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
|
|||||||
@@ -1,379 +0,0 @@
|
|||||||
// fleet_update.go — admin-only fleet rolling-update endpoints + page.
|
|
||||||
//
|
|
||||||
// Surface:
|
|
||||||
// - POST /api/fleet/update → starts a fleet update (JSON)
|
|
||||||
// - POST /api/fleet-updates/{id}/cancel
|
|
||||||
// - GET /api/fleet-updates/{id} → JSON parent + per-host array
|
|
||||||
// - GET /settings/fleet-update → admin UI page
|
|
||||||
// - GET /settings/fleet-update/partial → htmx polling fragment
|
|
||||||
//
|
|
||||||
// All routes are mounted in the admin band (see routes()).
|
|
||||||
package http
|
|
||||||
|
|
||||||
import (
|
|
||||||
"context"
|
|
||||||
"encoding/json"
|
|
||||||
"errors"
|
|
||||||
"log/slog"
|
|
||||||
stdhttp "net/http"
|
|
||||||
"time"
|
|
||||||
|
|
||||||
"github.com/go-chi/chi/v5"
|
|
||||||
"github.com/oklog/ulid/v2"
|
|
||||||
|
|
||||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
|
|
||||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/version"
|
|
||||||
)
|
|
||||||
|
|
||||||
// fleetUpdateStartReq is the request body accepted by
// POST /api/fleet/update. Either field may be left out.
type fleetUpdateStartReq struct {
	// TargetVersion is the version to roll to. When empty the handler
	// substitutes the server's own version.
	TargetVersion string `json:"target_version,omitempty"`
	// HostIDs restricts the roll to these hosts. When empty the handler
	// derives the set of online, out-of-date hosts itself.
	HostIDs []string `json:"host_ids,omitempty"`
}
|
|
||||||
|
|
||||||
// fleetUpdateHostView is the per-host entry in the
// GET /api/fleet-updates/{id} response. The host name is filled in by the
// handler so clients do not have to look each host up separately.
type fleetUpdateHostView struct {
	// HostID identifies the host this row belongs to.
	HostID string `json:"host_id"`
	// HostName is the display name; omitted from JSON when empty.
	HostName string `json:"host_name,omitempty"`
	// Position is copied from the stored row's Position.
	Position int `json:"position"`
	// Status is the per-host state string as stored.
	Status string `json:"status"`
	// JobID is copied from the stored row; omitted from JSON when empty.
	JobID string `json:"job_id,omitempty"`
	// FailedReason is copied from the stored row; omitted when empty.
	FailedReason string `json:"failed_reason,omitempty"`
}
|
|
||||||
|
|
||||||
// fleetUpdateView is the JSON projection of the parent + children.
|
|
||||||
type fleetUpdateView struct {
|
|
||||||
ID string `json:"id"`
|
|
||||||
StartedAt string `json:"started_at"`
|
|
||||||
StartedByUserID string `json:"started_by_user_id"`
|
|
||||||
TargetVersion string `json:"target_version"`
|
|
||||||
Status string `json:"status"`
|
|
||||||
CurrentHostID string `json:"current_host_id,omitempty"`
|
|
||||||
HaltedReason string `json:"halted_reason,omitempty"`
|
|
||||||
CompletedAt *string `json:"completed_at,omitempty"`
|
|
||||||
Hosts []fleetUpdateHostView `json:"hosts"`
|
|
||||||
}
|
|
||||||
|
|
||||||
// fleetUpdatePage backs both the full /settings/fleet-update page
|
|
||||||
// and the partial polled fragment. Idle / Active are mutually
|
|
||||||
// exclusive: if Active is non-nil, render the progress view.
|
|
||||||
type fleetUpdatePage struct {
|
|
||||||
// Idle-state fields.
|
|
||||||
OutOfDateHosts []store.Host // online hosts whose version != target
|
|
||||||
TargetVersion string
|
|
||||||
|
|
||||||
// Active-state fields. Nil when no fleet update has ever run.
|
|
||||||
Active *store.FleetUpdate
|
|
||||||
ActiveRows []fleetUpdateHostView
|
|
||||||
|
|
||||||
// Common.
|
|
||||||
HostNames map[string]string
|
|
||||||
// PollURL is the partial endpoint htmx polls every few seconds.
|
|
||||||
PollURL string
|
|
||||||
}
|
|
||||||
|
|
||||||
// handleAPIFleetUpdateStart is POST /api/fleet/update.
|
|
||||||
func (s *Server) handleAPIFleetUpdateStart(w stdhttp.ResponseWriter, r *stdhttp.Request) {
|
|
||||||
user, ok := s.requireUser(r)
|
|
||||||
if !ok {
|
|
||||||
writeJSONError(w, stdhttp.StatusUnauthorized, "unauthorised", "")
|
|
||||||
return
|
|
||||||
}
|
|
||||||
if s.deps.FleetWorker == nil {
|
|
||||||
writeJSONError(w, stdhttp.StatusServiceUnavailable, "fleet_worker_unavailable", "")
|
|
||||||
return
|
|
||||||
}
|
|
||||||
var body fleetUpdateStartReq
|
|
||||||
// Empty body is fine — both fields are optional.
|
|
||||||
if r.ContentLength != 0 {
|
|
||||||
if err := json.NewDecoder(r.Body).Decode(&body); err != nil {
|
|
||||||
writeJSONError(w, stdhttp.StatusBadRequest, "invalid_json", err.Error())
|
|
||||||
return
|
|
||||||
}
|
|
||||||
}
|
|
||||||
target := body.TargetVersion
|
|
||||||
if target == "" {
|
|
||||||
target = version.Version
|
|
||||||
}
|
|
||||||
hostIDs := body.HostIDs
|
|
||||||
if len(hostIDs) == 0 {
|
|
||||||
derived, err := s.deriveOutOfDateOnlineHostIDs(r.Context(), target)
|
|
||||||
if err != nil {
|
|
||||||
writeJSONError(w, stdhttp.StatusInternalServerError, "internal", err.Error())
|
|
||||||
return
|
|
||||||
}
|
|
||||||
hostIDs = derived
|
|
||||||
}
|
|
||||||
if len(hostIDs) == 0 {
|
|
||||||
writeJSONError(w, stdhttp.StatusConflict, "no_hosts_eligible",
|
|
||||||
"no online hosts are out of date")
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
fuID, err := s.deps.FleetWorker.Start(r.Context(), user.ID, target, hostIDs)
|
|
||||||
if err != nil {
|
|
||||||
if errors.Is(err, store.ErrFleetUpdateRunning) {
|
|
||||||
writeJSONError(w, stdhttp.StatusConflict, "fleet_update_in_progress", err.Error())
|
|
||||||
return
|
|
||||||
}
|
|
||||||
writeJSONError(w, stdhttp.StatusInternalServerError, "internal", err.Error())
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
auditPayload, _ := json.Marshal(map[string]any{
|
|
||||||
"fleet_update_id": fuID,
|
|
||||||
"target_version": target,
|
|
||||||
"host_count": len(hostIDs),
|
|
||||||
})
|
|
||||||
_ = s.deps.Store.AppendAudit(r.Context(), store.AuditEntry{
|
|
||||||
ID: ulid.Make().String(), UserID: &user.ID, Actor: "user",
|
|
||||||
Action: "fleet.update_started",
|
|
||||||
TargetKind: ptr("fleet_update"), TargetID: &fuID,
|
|
||||||
TS: time.Now().UTC(),
|
|
||||||
Payload: auditPayload,
|
|
||||||
})
|
|
||||||
|
|
||||||
writeJSON(w, stdhttp.StatusAccepted, map[string]string{"fleet_update_id": fuID})
|
|
||||||
}
|
|
||||||
|
|
||||||
// handleAPIFleetUpdateCancel is POST /api/fleet-updates/{id}/cancel.
|
|
||||||
func (s *Server) handleAPIFleetUpdateCancel(w stdhttp.ResponseWriter, r *stdhttp.Request) {
|
|
||||||
user, ok := s.requireUser(r)
|
|
||||||
if !ok {
|
|
||||||
writeJSONError(w, stdhttp.StatusUnauthorized, "unauthorised", "")
|
|
||||||
return
|
|
||||||
}
|
|
||||||
if s.deps.FleetWorker == nil {
|
|
||||||
writeJSONError(w, stdhttp.StatusServiceUnavailable, "fleet_worker_unavailable", "")
|
|
||||||
return
|
|
||||||
}
|
|
||||||
fuID := chi.URLParam(r, "id")
|
|
||||||
if fuID == "" {
|
|
||||||
writeJSONError(w, stdhttp.StatusBadRequest, "missing_id", "")
|
|
||||||
return
|
|
||||||
}
|
|
||||||
fu, _, err := s.deps.Store.GetFleetUpdate(r.Context(), fuID)
|
|
||||||
if err != nil {
|
|
||||||
if errors.Is(err, store.ErrNotFound) {
|
|
||||||
writeJSONError(w, stdhttp.StatusNotFound, "fleet_update_not_found", "")
|
|
||||||
return
|
|
||||||
}
|
|
||||||
writeJSONError(w, stdhttp.StatusInternalServerError, "internal", err.Error())
|
|
||||||
return
|
|
||||||
}
|
|
||||||
if fu.Status != "running" {
|
|
||||||
writeJSONError(w, stdhttp.StatusConflict, "fleet_update_not_running",
|
|
||||||
"fleet update is not in the running state")
|
|
||||||
return
|
|
||||||
}
|
|
||||||
if err := s.deps.FleetWorker.Cancel(r.Context(), fuID); err != nil {
|
|
||||||
writeJSONError(w, stdhttp.StatusInternalServerError, "internal", err.Error())
|
|
||||||
return
|
|
||||||
}
|
|
||||||
_ = s.deps.Store.AppendAudit(r.Context(), store.AuditEntry{
|
|
||||||
ID: ulid.Make().String(), UserID: &user.ID, Actor: "user",
|
|
||||||
Action: "fleet.update_cancelled",
|
|
||||||
TargetKind: ptr("fleet_update"), TargetID: &fuID,
|
|
||||||
TS: time.Now().UTC(),
|
|
||||||
})
|
|
||||||
w.WriteHeader(stdhttp.StatusNoContent)
|
|
||||||
}
|
|
||||||
|
|
||||||
// handleAPIFleetUpdateGet is GET /api/fleet-updates/{id}.
|
|
||||||
func (s *Server) handleAPIFleetUpdateGet(w stdhttp.ResponseWriter, r *stdhttp.Request) {
|
|
||||||
if _, ok := s.requireUser(r); !ok {
|
|
||||||
writeJSONError(w, stdhttp.StatusUnauthorized, "unauthorised", "")
|
|
||||||
return
|
|
||||||
}
|
|
||||||
fuID := chi.URLParam(r, "id")
|
|
||||||
fu, hosts, err := s.deps.Store.GetFleetUpdate(r.Context(), fuID)
|
|
||||||
if err != nil {
|
|
||||||
if errors.Is(err, store.ErrNotFound) {
|
|
||||||
writeJSONError(w, stdhttp.StatusNotFound, "fleet_update_not_found", "")
|
|
||||||
return
|
|
||||||
}
|
|
||||||
writeJSONError(w, stdhttp.StatusInternalServerError, "internal", err.Error())
|
|
||||||
return
|
|
||||||
}
|
|
||||||
names := s.hostNameMap(r)
|
|
||||||
view := fleetUpdateView{
|
|
||||||
ID: fu.ID,
|
|
||||||
StartedAt: fu.StartedAt.UTC().Format(time.RFC3339Nano),
|
|
||||||
StartedByUserID: fu.StartedByUserID,
|
|
||||||
TargetVersion: fu.TargetVersion,
|
|
||||||
Status: fu.Status,
|
|
||||||
CurrentHostID: fu.CurrentHostID,
|
|
||||||
HaltedReason: fu.HaltedReason,
|
|
||||||
Hosts: make([]fleetUpdateHostView, 0, len(hosts)),
|
|
||||||
}
|
|
||||||
if fu.CompletedAt != nil {
|
|
||||||
s := fu.CompletedAt.UTC().Format(time.RFC3339Nano)
|
|
||||||
view.CompletedAt = &s
|
|
||||||
}
|
|
||||||
for _, h := range hosts {
|
|
||||||
view.Hosts = append(view.Hosts, fleetUpdateHostView{
|
|
||||||
HostID: h.HostID,
|
|
||||||
HostName: names[h.HostID],
|
|
||||||
Position: h.Position,
|
|
||||||
Status: h.Status,
|
|
||||||
JobID: h.JobID,
|
|
||||||
FailedReason: h.FailedReason,
|
|
||||||
})
|
|
||||||
}
|
|
||||||
writeJSON(w, stdhttp.StatusOK, view)
|
|
||||||
}
|
|
||||||
|
|
||||||
// handleUIFleetUpdate renders /settings/fleet-update.
|
|
||||||
func (s *Server) handleUIFleetUpdate(w stdhttp.ResponseWriter, r *stdhttp.Request) {
|
|
||||||
u := s.requireUIUser(w, r)
|
|
||||||
if u == nil {
|
|
||||||
return
|
|
||||||
}
|
|
||||||
page, err := s.buildFleetUpdatePage(r)
|
|
||||||
if err != nil {
|
|
||||||
slog.Error("ui fleet update: build page", "err", err)
|
|
||||||
stdhttp.Error(w, "internal", stdhttp.StatusInternalServerError)
|
|
||||||
return
|
|
||||||
}
|
|
||||||
view := s.baseView(r, u)
|
|
||||||
view.Title = "Fleet update · restic-manager"
|
|
||||||
view.Active = "settings"
|
|
||||||
view.Page = page
|
|
||||||
if err := s.deps.UI.Render(w, "fleet_update", view); err != nil {
|
|
||||||
slog.Error("ui fleet update: render", "err", err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// handleUIFleetUpdatePartial renders just the inner panel for htmx
|
|
||||||
// auto-refresh polling — same data, no chrome.
|
|
||||||
func (s *Server) handleUIFleetUpdatePartial(w stdhttp.ResponseWriter, r *stdhttp.Request) {
|
|
||||||
u := s.requireUIUser(w, r)
|
|
||||||
if u == nil {
|
|
||||||
return
|
|
||||||
}
|
|
||||||
page, err := s.buildFleetUpdatePage(r)
|
|
||||||
if err != nil {
|
|
||||||
slog.Error("ui fleet update partial: build page", "err", err)
|
|
||||||
stdhttp.Error(w, "internal", stdhttp.StatusInternalServerError)
|
|
||||||
return
|
|
||||||
}
|
|
||||||
view := s.baseView(r, u)
|
|
||||||
view.Page = page
|
|
||||||
if err := s.deps.UI.RenderPartial(w, "fleet_update_inner", view); err != nil {
|
|
||||||
slog.Error("ui fleet update partial: render", "err", err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// buildFleetUpdatePage assembles the data both /settings/fleet-update
|
|
||||||
// and its partial render against. Resolves the most-recent fleet
|
|
||||||
// update (active OR completed/cancelled/halted) so the page can show
|
|
||||||
// the last roll's result instead of disappearing into "idle" the
|
|
||||||
// instant a roll finishes.
|
|
||||||
func (s *Server) buildFleetUpdatePage(r *stdhttp.Request) (fleetUpdatePage, error) {
|
|
||||||
page := fleetUpdatePage{
|
|
||||||
TargetVersion: version.Version,
|
|
||||||
HostNames: map[string]string{},
|
|
||||||
PollURL: "/settings/fleet-update/partial",
|
|
||||||
}
|
|
||||||
hosts, err := s.deps.Store.ListHosts(r.Context())
|
|
||||||
if err != nil {
|
|
||||||
return page, err
|
|
||||||
}
|
|
||||||
for _, h := range hosts {
|
|
||||||
page.HostNames[h.ID] = h.Name
|
|
||||||
}
|
|
||||||
|
|
||||||
active, err := s.deps.Store.ActiveFleetUpdate(r.Context())
|
|
||||||
if err != nil {
|
|
||||||
return page, err
|
|
||||||
}
|
|
||||||
mostRecent := active
|
|
||||||
if mostRecent == nil {
|
|
||||||
// Fall back to the most recent terminal row so the page can
|
|
||||||
// show "completed" / "halted" / "cancelled" once the worker
|
|
||||||
// finishes. One small bespoke query — keeps the page from
|
|
||||||
// flashing back to "idle" the instant a roll wraps up.
|
|
||||||
var id string
|
|
||||||
err := s.deps.Store.DB().QueryRowContext(r.Context(),
|
|
||||||
`SELECT id FROM fleet_updates ORDER BY started_at DESC LIMIT 1`).
|
|
||||||
Scan(&id)
|
|
||||||
if err == nil {
|
|
||||||
fu, _, gerr := s.deps.Store.GetFleetUpdate(r.Context(), id)
|
|
||||||
if gerr == nil {
|
|
||||||
mostRecent = fu
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if mostRecent != nil {
|
|
||||||
_, rows, gerr := s.deps.Store.GetFleetUpdate(r.Context(), mostRecent.ID)
|
|
||||||
if gerr == nil {
|
|
||||||
page.Active = mostRecent
|
|
||||||
page.ActiveRows = make([]fleetUpdateHostView, 0, len(rows))
|
|
||||||
for _, hr := range rows {
|
|
||||||
page.ActiveRows = append(page.ActiveRows, fleetUpdateHostView{
|
|
||||||
HostID: hr.HostID,
|
|
||||||
HostName: page.HostNames[hr.HostID],
|
|
||||||
Position: hr.Position,
|
|
||||||
Status: hr.Status,
|
|
||||||
JobID: hr.JobID,
|
|
||||||
FailedReason: hr.FailedReason,
|
|
||||||
})
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Idle list (or "still out of date" reference even when an active
|
|
||||||
// roll is running — cheap to compute, harmless to attach).
|
|
||||||
for _, h := range hosts {
|
|
||||||
if h.Status != "online" {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
if h.AgentVersion == "" || h.AgentVersion == page.TargetVersion {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
page.OutOfDateHosts = append(page.OutOfDateHosts, h)
|
|
||||||
}
|
|
||||||
return page, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// deriveOutOfDateOnlineHostIDs returns the list of host IDs that
|
|
||||||
// (a) are online (Hub.Connected) and (b) have an agent_version that's
|
|
||||||
// non-empty AND != target. Used by the start endpoint when the caller
|
|
||||||
// omits host_ids.
|
|
||||||
func (s *Server) deriveOutOfDateOnlineHostIDs(ctx context.Context, target string) ([]string, error) {
|
|
||||||
hosts, err := s.deps.Store.ListHosts(ctx)
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
out := []string{}
|
|
||||||
for _, h := range hosts {
|
|
||||||
if h.AgentVersion == "" || h.AgentVersion == target {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
if !s.deps.Hub.Connected(h.ID) {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
out = append(out, h.ID)
|
|
||||||
}
|
|
||||||
return out, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// hostNameMap returns hostID → name; used to hydrate fleet-update
|
|
||||||
// JSON responses.
|
|
||||||
func (s *Server) hostNameMap(r *stdhttp.Request) map[string]string {
|
|
||||||
out := map[string]string{}
|
|
||||||
hosts, err := s.deps.Store.ListHosts(r.Context())
|
|
||||||
if err != nil {
|
|
||||||
return out
|
|
||||||
}
|
|
||||||
for _, h := range hosts {
|
|
||||||
out[h.ID] = h.Name
|
|
||||||
}
|
|
||||||
return out
|
|
||||||
}
|
|
||||||
@@ -1,334 +0,0 @@
|
|||||||
// fleet_update_test.go — coverage for the P6-15 fleet-update HTTP
|
|
||||||
// surface: start/cancel/get JSON endpoints + RBAC.
|
|
||||||
package http
|
|
||||||
|
|
||||||
import (
|
|
||||||
"bytes"
|
|
||||||
"context"
|
|
||||||
"encoding/json"
|
|
||||||
stdhttp "net/http"
|
|
||||||
"sync"
|
|
||||||
"testing"
|
|
||||||
"time"
|
|
||||||
|
|
||||||
"github.com/oklog/ulid/v2"
|
|
||||||
|
|
||||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/api"
|
|
||||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/ws"
|
|
||||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
|
|
||||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/version"
|
|
||||||
)
|
|
||||||
|
|
||||||
// fakeFleetWorker stands in for *fleetupdate.Worker in HTTP tests.
|
|
||||||
// It records what was passed to Start/Cancel and lets tests inject
|
|
||||||
// canned errors. Satisfies the FleetWorker interface in
|
|
||||||
// host_update.go.
|
|
||||||
type fakeFleetWorker struct {
|
|
||||||
mu sync.Mutex
|
|
||||||
|
|
||||||
startCalls []fakeStartCall
|
|
||||||
startID string
|
|
||||||
startErr error
|
|
||||||
|
|
||||||
cancelCalls []string
|
|
||||||
cancelErr error
|
|
||||||
}
|
|
||||||
|
|
||||||
// fakeStartCall captures the arguments of one fakeFleetWorker.Start call.
// Field order is significant: Start builds it with a positional literal.
type fakeStartCall struct {
	UserID  string   // user the roll was started for
	Target  string   // requested target version
	HostIDs []string // host IDs passed to Start
}
|
|
||||||
|
|
||||||
func (f *fakeFleetWorker) Start(_ context.Context, userID, target string, hostIDs []string) (string, error) {
|
|
||||||
f.mu.Lock()
|
|
||||||
defer f.mu.Unlock()
|
|
||||||
f.startCalls = append(f.startCalls, fakeStartCall{userID, target, append([]string(nil), hostIDs...)})
|
|
||||||
if f.startErr != nil {
|
|
||||||
return "", f.startErr
|
|
||||||
}
|
|
||||||
return f.startID, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func (f *fakeFleetWorker) Cancel(_ context.Context, id string) error {
|
|
||||||
f.mu.Lock()
|
|
||||||
defer f.mu.Unlock()
|
|
||||||
f.cancelCalls = append(f.cancelCalls, id)
|
|
||||||
return f.cancelErr
|
|
||||||
}
|
|
||||||
|
|
||||||
// helloOnlineHost is the smallest setup that lets the dispatch /
|
|
||||||
// derivation logic see a host as "online + version mismatch".
|
|
||||||
// Returns the host id.
|
|
||||||
func helloOnlineHost(t *testing.T, srv *Server, st *store.Store, name, agentVer string) string {
|
|
||||||
t.Helper()
|
|
||||||
id := makeHost(t, st, name)
|
|
||||||
if err := st.MarkHostHello(context.Background(), id, agentVer, "0.17", api.CurrentProtocolVersion, time.Now().UTC()); err != nil {
|
|
||||||
t.Fatalf("mark hello: %v", err)
|
|
||||||
}
|
|
||||||
// Mark connected on the hub so deriveOutOfDateOnlineHostIDs
|
|
||||||
// considers it online without needing a real WS handshake. The
|
|
||||||
// Conn has a nil websocket pointer — tests never call Send on it.
|
|
||||||
srv.deps.Hub.Register(id, ws.NewConn(id, nil))
|
|
||||||
return id
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestFleetUpdateStartHappyPath(t *testing.T) {
|
|
||||||
t.Parallel()
|
|
||||||
srv, ts, st := rawTestServer(t)
|
|
||||||
worker := &fakeFleetWorker{startID: ulid.Make().String()}
|
|
||||||
srv.deps.FleetWorker = worker
|
|
||||||
|
|
||||||
cookie, uid := loginAsAdminWithID(t, st)
|
|
||||||
hostID := helloOnlineHost(t, srv, st, "fu-host", "v0")
|
|
||||||
|
|
||||||
body := map[string]any{"host_ids": []string{hostID}}
|
|
||||||
raw, _ := json.Marshal(body)
|
|
||||||
req, _ := stdhttp.NewRequest("POST", ts.URL+"/api/fleet/update", bytes.NewReader(raw))
|
|
||||||
req.AddCookie(cookie)
|
|
||||||
req.Header.Set("Content-Type", "application/json")
|
|
||||||
res, err := stdhttp.DefaultClient.Do(req)
|
|
||||||
if err != nil {
|
|
||||||
t.Fatalf("do: %v", err)
|
|
||||||
}
|
|
||||||
defer res.Body.Close()
|
|
||||||
if res.StatusCode != stdhttp.StatusAccepted {
|
|
||||||
t.Fatalf("status: got %d, want 202", res.StatusCode)
|
|
||||||
}
|
|
||||||
var out struct {
|
|
||||||
FleetUpdateID string `json:"fleet_update_id"`
|
|
||||||
}
|
|
||||||
if err := json.NewDecoder(res.Body).Decode(&out); err != nil {
|
|
||||||
t.Fatalf("decode: %v", err)
|
|
||||||
}
|
|
||||||
if out.FleetUpdateID != worker.startID {
|
|
||||||
t.Fatalf("fleet_update_id: got %q, want %q", out.FleetUpdateID, worker.startID)
|
|
||||||
}
|
|
||||||
worker.mu.Lock()
|
|
||||||
if len(worker.startCalls) != 1 || worker.startCalls[0].UserID != uid {
|
|
||||||
t.Fatalf("start calls: %+v", worker.startCalls)
|
|
||||||
}
|
|
||||||
if got := worker.startCalls[0].HostIDs; len(got) != 1 || got[0] != hostID {
|
|
||||||
t.Fatalf("host_ids: %v", got)
|
|
||||||
}
|
|
||||||
worker.mu.Unlock()
|
|
||||||
|
|
||||||
// Audit row.
|
|
||||||
var n int
|
|
||||||
if err := st.DB().QueryRow(
|
|
||||||
`SELECT COUNT(*) FROM audit_log WHERE action = 'fleet.update_started' AND target_id = ?`,
|
|
||||||
out.FleetUpdateID).Scan(&n); err != nil {
|
|
||||||
t.Fatalf("audit count: %v", err)
|
|
||||||
}
|
|
||||||
if n != 1 {
|
|
||||||
t.Fatalf("audit rows: got %d, want 1", n)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestFleetUpdateStartConflictWhenAlreadyRunning(t *testing.T) {
|
|
||||||
t.Parallel()
|
|
||||||
srv, ts, st := rawTestServer(t)
|
|
||||||
worker := &fakeFleetWorker{startErr: store.ErrFleetUpdateRunning}
|
|
||||||
srv.deps.FleetWorker = worker
|
|
||||||
cookie := loginAsAdmin(t, st)
|
|
||||||
_ = helloOnlineHost(t, srv, st, "fu-host", "v0")
|
|
||||||
|
|
||||||
req, _ := stdhttp.NewRequest("POST", ts.URL+"/api/fleet/update", bytes.NewReader([]byte(`{}`)))
|
|
||||||
req.AddCookie(cookie)
|
|
||||||
req.Header.Set("Content-Type", "application/json")
|
|
||||||
res, err := stdhttp.DefaultClient.Do(req)
|
|
||||||
if err != nil {
|
|
||||||
t.Fatalf("do: %v", err)
|
|
||||||
}
|
|
||||||
defer res.Body.Close()
|
|
||||||
if res.StatusCode != stdhttp.StatusConflict {
|
|
||||||
t.Fatalf("status: got %d, want 409", res.StatusCode)
|
|
||||||
}
|
|
||||||
body := readJSONError(t, res.Body)
|
|
||||||
if body.Code != "fleet_update_in_progress" {
|
|
||||||
t.Fatalf("code: %q", body.Code)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestFleetUpdateStartDerivesHostIDsWhenEmpty(t *testing.T) {
|
|
||||||
t.Parallel()
|
|
||||||
srv, ts, st := rawTestServer(t)
|
|
||||||
worker := &fakeFleetWorker{startID: ulid.Make().String()}
|
|
||||||
srv.deps.FleetWorker = worker
|
|
||||||
cookie := loginAsAdmin(t, st)
|
|
||||||
|
|
||||||
// Two online + out-of-date, one online + at-target, one offline.
|
|
||||||
a := helloOnlineHost(t, srv, st, "behind-a", "v0")
|
|
||||||
b := helloOnlineHost(t, srv, st, "behind-b", "v0")
|
|
||||||
_ = helloOnlineHost(t, srv, st, "uptodate", version.Version)
|
|
||||||
offlineID := makeHost(t, st, "offline-host")
|
|
||||||
if err := st.MarkHostHello(context.Background(), offlineID, "v0", "0.17", api.CurrentProtocolVersion, time.Now().UTC()); err != nil {
|
|
||||||
t.Fatalf("mark hello: %v", err)
|
|
||||||
}
|
|
||||||
// Don't MarkOnline → derivation should skip.
|
|
||||||
|
|
||||||
req, _ := stdhttp.NewRequest("POST", ts.URL+"/api/fleet/update", bytes.NewReader([]byte(`{}`)))
|
|
||||||
req.AddCookie(cookie)
|
|
||||||
req.Header.Set("Content-Type", "application/json")
|
|
||||||
res, err := stdhttp.DefaultClient.Do(req)
|
|
||||||
if err != nil {
|
|
||||||
t.Fatalf("do: %v", err)
|
|
||||||
}
|
|
||||||
defer res.Body.Close()
|
|
||||||
if res.StatusCode != stdhttp.StatusAccepted {
|
|
||||||
t.Fatalf("status: got %d, want 202", res.StatusCode)
|
|
||||||
}
|
|
||||||
worker.mu.Lock()
|
|
||||||
defer worker.mu.Unlock()
|
|
||||||
if len(worker.startCalls) != 1 {
|
|
||||||
t.Fatalf("start calls: %d", len(worker.startCalls))
|
|
||||||
}
|
|
||||||
got := worker.startCalls[0].HostIDs
|
|
||||||
want := map[string]bool{a: true, b: true}
|
|
||||||
if len(got) != 2 || !want[got[0]] || !want[got[1]] {
|
|
||||||
t.Fatalf("derived host_ids: got %v, want both of %v", got, []string{a, b})
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestFleetUpdateCancelHappyPath(t *testing.T) {
|
|
||||||
t.Parallel()
|
|
||||||
srv, ts, st := rawTestServer(t)
|
|
||||||
worker := &fakeFleetWorker{}
|
|
||||||
srv.deps.FleetWorker = worker
|
|
||||||
cookie := loginAsAdmin(t, st)
|
|
||||||
|
|
||||||
// Seed a running fleet update directly.
|
|
||||||
fuID := ulid.Make().String()
|
|
||||||
uid := ulid.Make().String()
|
|
||||||
if err := st.CreateUser(context.Background(), store.User{
|
|
||||||
ID: uid, Username: "starter", PasswordHash: "x",
|
|
||||||
Role: store.RoleAdmin, CreatedAt: time.Now().UTC(),
|
|
||||||
}); err != nil {
|
|
||||||
t.Fatalf("seed user: %v", err)
|
|
||||||
}
|
|
||||||
hostID := makeHost(t, st, "fu-cancel-host")
|
|
||||||
if err := st.CreateFleetUpdate(context.Background(),
|
|
||||||
store.FleetUpdate{ID: fuID, StartedByUserID: uid, TargetVersion: "v1"},
|
|
||||||
[]string{hostID}); err != nil {
|
|
||||||
t.Fatalf("seed fleet update: %v", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
req, _ := stdhttp.NewRequest("POST", ts.URL+"/api/fleet-updates/"+fuID+"/cancel", nil)
|
|
||||||
req.AddCookie(cookie)
|
|
||||||
res, err := stdhttp.DefaultClient.Do(req)
|
|
||||||
if err != nil {
|
|
||||||
t.Fatalf("do: %v", err)
|
|
||||||
}
|
|
||||||
defer res.Body.Close()
|
|
||||||
if res.StatusCode != stdhttp.StatusNoContent {
|
|
||||||
t.Fatalf("status: got %d, want 204", res.StatusCode)
|
|
||||||
}
|
|
||||||
worker.mu.Lock()
|
|
||||||
if len(worker.cancelCalls) != 1 || worker.cancelCalls[0] != fuID {
|
|
||||||
t.Fatalf("cancel calls: %v", worker.cancelCalls)
|
|
||||||
}
|
|
||||||
worker.mu.Unlock()
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestFleetUpdateCancelNotRunning(t *testing.T) {
|
|
||||||
t.Parallel()
|
|
||||||
srv, ts, st := rawTestServer(t)
|
|
||||||
srv.deps.FleetWorker = &fakeFleetWorker{}
|
|
||||||
cookie := loginAsAdmin(t, st)
|
|
||||||
|
|
||||||
// Seed + complete one so it's no longer running.
|
|
||||||
fuID := ulid.Make().String()
|
|
||||||
uid := ulid.Make().String()
|
|
||||||
_ = st.CreateUser(context.Background(), store.User{
|
|
||||||
ID: uid, Username: "starter2", PasswordHash: "x",
|
|
||||||
Role: store.RoleAdmin, CreatedAt: time.Now().UTC(),
|
|
||||||
})
|
|
||||||
hostID := makeHost(t, st, "fu-done-host")
|
|
||||||
_ = st.CreateFleetUpdate(context.Background(),
|
|
||||||
store.FleetUpdate{ID: fuID, StartedByUserID: uid, TargetVersion: "v1"},
|
|
||||||
[]string{hostID})
|
|
||||||
if err := st.CompleteFleetUpdate(context.Background(), fuID, time.Now().UTC()); err != nil {
|
|
||||||
t.Fatalf("complete: %v", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
req, _ := stdhttp.NewRequest("POST", ts.URL+"/api/fleet-updates/"+fuID+"/cancel", nil)
|
|
||||||
req.AddCookie(cookie)
|
|
||||||
res, err := stdhttp.DefaultClient.Do(req)
|
|
||||||
if err != nil {
|
|
||||||
t.Fatalf("do: %v", err)
|
|
||||||
}
|
|
||||||
defer res.Body.Close()
|
|
||||||
if res.StatusCode != stdhttp.StatusConflict {
|
|
||||||
t.Fatalf("status: got %d, want 409", res.StatusCode)
|
|
||||||
}
|
|
||||||
body := readJSONError(t, res.Body)
|
|
||||||
if body.Code != "fleet_update_not_running" {
|
|
||||||
t.Fatalf("code: %q", body.Code)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestFleetUpdateGetHydrates(t *testing.T) {
|
|
||||||
t.Parallel()
|
|
||||||
_, ts, st := rawTestServer(t)
|
|
||||||
cookie := loginAsAdmin(t, st)
|
|
||||||
|
|
||||||
uid := ulid.Make().String()
|
|
||||||
_ = st.CreateUser(context.Background(), store.User{
|
|
||||||
ID: uid, Username: "starter3", PasswordHash: "x",
|
|
||||||
Role: store.RoleAdmin, CreatedAt: time.Now().UTC(),
|
|
||||||
})
|
|
||||||
hostID := makeHost(t, st, "fu-get-host")
|
|
||||||
fuID := ulid.Make().String()
|
|
||||||
if err := st.CreateFleetUpdate(context.Background(),
|
|
||||||
store.FleetUpdate{ID: fuID, StartedByUserID: uid, TargetVersion: "v1.2.3"},
|
|
||||||
[]string{hostID}); err != nil {
|
|
||||||
t.Fatalf("seed: %v", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
req, _ := stdhttp.NewRequest("GET", ts.URL+"/api/fleet-updates/"+fuID, nil)
|
|
||||||
req.AddCookie(cookie)
|
|
||||||
res, err := stdhttp.DefaultClient.Do(req)
|
|
||||||
if err != nil {
|
|
||||||
t.Fatalf("do: %v", err)
|
|
||||||
}
|
|
||||||
defer res.Body.Close()
|
|
||||||
if res.StatusCode != stdhttp.StatusOK {
|
|
||||||
t.Fatalf("status: got %d, want 200", res.StatusCode)
|
|
||||||
}
|
|
||||||
var got fleetUpdateView
|
|
||||||
if err := json.NewDecoder(res.Body).Decode(&got); err != nil {
|
|
||||||
t.Fatalf("decode: %v", err)
|
|
||||||
}
|
|
||||||
if got.ID != fuID || got.TargetVersion != "v1.2.3" || got.Status != "running" {
|
|
||||||
t.Fatalf("parent: %+v", got)
|
|
||||||
}
|
|
||||||
if len(got.Hosts) != 1 || got.Hosts[0].HostID != hostID || got.Hosts[0].HostName != "fu-get-host" {
|
|
||||||
t.Fatalf("hosts: %+v", got.Hosts)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestFleetUpdateRBAC(t *testing.T) {
|
|
||||||
t.Parallel()
|
|
||||||
_, ts, st := rawTestServer(t)
|
|
||||||
|
|
||||||
for _, role := range []store.Role{store.RoleViewer, store.RoleOperator} {
|
|
||||||
role := role
|
|
||||||
t.Run(string(role), func(t *testing.T) {
|
|
||||||
cookie := loginAsRole(t, st, role)
|
|
||||||
req, _ := stdhttp.NewRequest("POST", ts.URL+"/api/fleet/update", bytes.NewReader([]byte(`{}`)))
|
|
||||||
req.AddCookie(cookie)
|
|
||||||
req.Header.Set("Content-Type", "application/json")
|
|
||||||
res, err := stdhttp.DefaultClient.Do(req)
|
|
||||||
if err != nil {
|
|
||||||
t.Fatalf("do: %v", err)
|
|
||||||
}
|
|
||||||
defer res.Body.Close()
|
|
||||||
if res.StatusCode != stdhttp.StatusForbidden {
|
|
||||||
t.Fatalf("status: got %d, want 403", res.StatusCode)
|
|
||||||
}
|
|
||||||
})
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Compile-time check that fakeFleetWorker satisfies the FleetWorker interface.
|
|
||||||
var _ FleetWorker = (*fakeFleetWorker)(nil)
|
|
||||||
@@ -1,217 +0,0 @@
|
|||||||
package http
|
|
||||||
|
|
||||||
import (
|
|
||||||
"context"
|
|
||||||
"encoding/json"
|
|
||||||
stdhttp "net/http"
|
|
||||||
"time"
|
|
||||||
|
|
||||||
"github.com/go-chi/chi/v5"
|
|
||||||
"github.com/oklog/ulid/v2"
|
|
||||||
|
|
||||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/api"
|
|
||||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
|
|
||||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/version"
|
|
||||||
)
|
|
||||||
|
|
||||||
// UpdateWatcher is the narrow slice of ws.updateWatcher that this package
// relies on to record in-flight update dispatches. It is declared as an
// interface so tests can substitute a stub.
type UpdateWatcher interface {
	// Track records that the update job jobID was dispatched to hostID.
	Track(jobID, hostID string)
}
|
|
||||||
|
|
||||||
// FleetWorker is the narrow slice of fleetupdate.Worker that this package
// relies on. It is declared here for forward compatibility with P6-15; the
// single-host update endpoint does not call it.
type FleetWorker interface {
	// Start begins a fleet update on behalf of userID towards targetVersion
	// for the given hosts, returning an identifier string or an error.
	Start(ctx context.Context, userID, targetVersion string, hostIDs []string) (string, error)
	// Cancel requests cancellation of the fleet update fleetUpdateID.
	Cancel(ctx context.Context, fleetUpdateID string) error
}
|
|
||||||
|
|
||||||
// dispatchHostUpdateResult is the structured outcome of the shared dispatch
// path. Both the HTTP handlers and the fleet worker consume it, each
// rendering failures in its own idiom.
type dispatchHostUpdateResult struct {
	// JobID identifies the update job this result refers to.
	JobID string
	// Code is a machine-readable failure code; it is "" on success.
	Code string
	// Status is the HTTP status the JSON handler should use on error.
	Status int
	// Msg is optional human-readable detail.
	Msg string
}
|
|
||||||
|
|
||||||
// dispatchHostUpdate is the shared "send command.update to one host"
|
|
||||||
// path. It performs every pre-check (host exists, online, version
|
|
||||||
// mismatch, no in-flight update) and on success creates the jobs row,
|
|
||||||
// audits, dispatches the WS envelope, and tracks the watcher entry.
|
|
||||||
//
|
|
||||||
// Pre-checks are returned as structured codes rather than HTTP errors
|
|
||||||
// so the fleet worker can map them onto its own per-host status enum
|
|
||||||
// without parsing strings.
|
|
||||||
func (s *Server) dispatchHostUpdate(ctx context.Context, hostID string, actorKind string, actorID *string) dispatchHostUpdateResult {
|
|
||||||
host, err := s.deps.Store.GetHost(ctx, hostID)
|
|
||||||
if err != nil || host == nil {
|
|
||||||
return dispatchHostUpdateResult{Code: "host_not_found", Status: stdhttp.StatusNotFound}
|
|
||||||
}
|
|
||||||
if !s.deps.Hub.Connected(host.ID) {
|
|
||||||
return dispatchHostUpdateResult{
|
|
||||||
Code: "host_offline", Status: stdhttp.StatusConflict,
|
|
||||||
Msg: "agent is not currently connected",
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if host.AgentVersion != "" && host.AgentVersion == version.Version {
|
|
||||||
return dispatchHostUpdateResult{
|
|
||||||
Code: "already_up_to_date", Status: stdhttp.StatusConflict,
|
|
||||||
Msg: "agent already running version " + version.Version,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
existing, err := s.deps.Store.RunningUpdateJobForHost(ctx, hostID)
|
|
||||||
if err != nil {
|
|
||||||
return dispatchHostUpdateResult{Code: "internal", Status: stdhttp.StatusInternalServerError, Msg: err.Error()}
|
|
||||||
}
|
|
||||||
if existing != "" {
|
|
||||||
return dispatchHostUpdateResult{
|
|
||||||
Code: "update_in_progress", Status: stdhttp.StatusConflict,
|
|
||||||
Msg: "an update job is already in flight for this host",
|
|
||||||
JobID: existing,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
jobID := ulid.Make().String()
|
|
||||||
now := time.Now().UTC()
|
|
||||||
if err := s.deps.Store.CreateJob(ctx, store.Job{
|
|
||||||
ID: jobID, HostID: hostID, Kind: "update",
|
|
||||||
ActorKind: actorKind, ActorID: actorID,
|
|
||||||
CreatedAt: now,
|
|
||||||
}); err != nil {
|
|
||||||
return dispatchHostUpdateResult{Code: "internal", Status: stdhttp.StatusInternalServerError, Msg: err.Error()}
|
|
||||||
}
|
|
||||||
env, err := api.Marshal(api.MsgCommandUpdate, ulid.Make().String(), api.CommandUpdatePayload{
|
|
||||||
JobID: jobID,
|
|
||||||
})
|
|
||||||
if err != nil {
|
|
||||||
return dispatchHostUpdateResult{Code: "internal", Status: stdhttp.StatusInternalServerError, Msg: err.Error()}
|
|
||||||
}
|
|
||||||
if err := s.deps.Hub.Send(ctx, hostID, env); err != nil {
|
|
||||||
// Roll the job to failed so we don't leak a queued row.
|
|
||||||
_ = s.deps.Store.MarkJobFinished(ctx, jobID, "failed", -1, nil, err.Error(), time.Now().UTC())
|
|
||||||
return dispatchHostUpdateResult{
|
|
||||||
Code: "host_offline", Status: stdhttp.StatusConflict, Msg: err.Error(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if s.deps.UpdateWatcher != nil {
|
|
||||||
s.deps.UpdateWatcher.Track(jobID, hostID)
|
|
||||||
}
|
|
||||||
|
|
||||||
auditPayload, _ := json.Marshal(map[string]string{
|
|
||||||
"job_id": jobID,
|
|
||||||
"target_version": version.Version,
|
|
||||||
})
|
|
||||||
_ = s.deps.Store.AppendAudit(ctx, store.AuditEntry{
|
|
||||||
ID: ulid.Make().String(),
|
|
||||||
UserID: actorID,
|
|
||||||
Actor: actorKind,
|
|
||||||
Action: "host.update_dispatched",
|
|
||||||
TargetKind: ptr("host"),
|
|
||||||
TargetID: &hostID,
|
|
||||||
TS: now,
|
|
||||||
Payload: auditPayload,
|
|
||||||
})
|
|
||||||
|
|
||||||
return dispatchHostUpdateResult{JobID: jobID}
|
|
||||||
}
|
|
||||||
|
|
||||||
// handleHostUpdate is POST /api/hosts/{id}/update — JSON, admin-only.
|
|
||||||
func (s *Server) handleHostUpdate(w stdhttp.ResponseWriter, r *stdhttp.Request) {
|
|
||||||
user, ok := s.requireUser(r)
|
|
||||||
if !ok {
|
|
||||||
writeJSONError(w, stdhttp.StatusUnauthorized, "unauthorised", "")
|
|
||||||
return
|
|
||||||
}
|
|
||||||
hostID := chi.URLParam(r, "id")
|
|
||||||
if hostID == "" {
|
|
||||||
writeJSONError(w, stdhttp.StatusBadRequest, "missing_host_id", "")
|
|
||||||
return
|
|
||||||
}
|
|
||||||
actor := "user"
|
|
||||||
var actorID *string
|
|
||||||
if user != nil {
|
|
||||||
actorID = &user.ID
|
|
||||||
}
|
|
||||||
res := s.dispatchHostUpdate(r.Context(), hostID, actor, actorID)
|
|
||||||
if res.Code != "" {
|
|
||||||
writeJSONError(w, res.Status, res.Code, res.Msg)
|
|
||||||
return
|
|
||||||
}
|
|
||||||
writeJSON(w, stdhttp.StatusAccepted, map[string]string{"job_id": res.JobID})
|
|
||||||
}
|
|
||||||
|
|
||||||
// handleHostUpdateForm is the HTMX-friendly POST /hosts/{id}/update
|
|
||||||
// variant. On success it sets HX-Redirect to the job detail page; on
|
|
||||||
// pre-check failures it renders an inline error banner.
|
|
||||||
func (s *Server) handleHostUpdateForm(w stdhttp.ResponseWriter, r *stdhttp.Request) {
|
|
||||||
user, ok := s.requireUser(r)
|
|
||||||
if !ok {
|
|
||||||
stdhttp.Error(w, "unauthorised", stdhttp.StatusUnauthorized)
|
|
||||||
return
|
|
||||||
}
|
|
||||||
hostID := chi.URLParam(r, "id")
|
|
||||||
if hostID == "" {
|
|
||||||
stdhttp.Error(w, "missing host_id", stdhttp.StatusBadRequest)
|
|
||||||
return
|
|
||||||
}
|
|
||||||
actor := "user"
|
|
||||||
var actorID *string
|
|
||||||
if user != nil {
|
|
||||||
actorID = &user.ID
|
|
||||||
}
|
|
||||||
res := s.dispatchHostUpdate(r.Context(), hostID, actor, actorID)
|
|
||||||
if res.Code != "" {
|
|
||||||
// Inline banner for HTMX swaps. Mirrors what host_credentials
|
|
||||||
// returns on validation errors — small text/html fragment.
|
|
||||||
w.Header().Set("Content-Type", "text/html; charset=utf-8")
|
|
||||||
w.WriteHeader(res.Status)
|
|
||||||
msg := hostUpdateErrorMessage(res.Code, res.Msg)
|
|
||||||
_, _ = w.Write([]byte(`<div class="banner banner-error" role="alert">` + htmlEscape(msg) + `</div>`))
|
|
||||||
return
|
|
||||||
}
|
|
||||||
w.Header().Set("HX-Redirect", "/jobs/"+res.JobID)
|
|
||||||
w.WriteHeader(stdhttp.StatusOK)
|
|
||||||
}
|
|
||||||
|
|
||||||
// hostUpdateErrorMessage turns a dispatch failure code into the sentence
// shown in the UI banner. Known codes map to fixed text; any other code
// falls back to msg, or to a generic sentence when msg is empty.
func hostUpdateErrorMessage(code, msg string) string {
	fixed := map[string]string{
		"host_not_found":     "Host not found.",
		"host_offline":       "Agent is offline; can't deliver the update command.",
		"already_up_to_date": "Agent is already running the current version.",
		"update_in_progress": "An update is already in progress for this host.",
	}
	if text, known := fixed[code]; known {
		return text
	}
	if msg == "" {
		return "Update dispatch failed."
	}
	return msg
}
|
|
||||||
|
|
||||||
// htmlEscape is a minimal HTML escaper for the one-shot inline banner; it
// avoids pulling in html/template for a single call site.
//
// It replaces '&', '<', '>' and '"' with their named entities and copies
// every other byte through unchanged, so multi-byte UTF-8 sequences survive
// intact. Single quotes are NOT escaped, so the result is safe in element
// content and double-quoted attributes but not in single-quoted attributes.
func htmlEscape(s string) string {
	out := make([]byte, 0, len(s))
	for i := 0; i < len(s); i++ {
		switch s[i] {
		case '&':
			out = append(out, "&amp;"...)
		case '<':
			out = append(out, "&lt;"...)
		case '>':
			out = append(out, "&gt;"...)
		case '"':
			out = append(out, "&quot;"...)
		default:
			out = append(out, s[i])
		}
	}
	return string(out)
}
|
|
||||||
@@ -1,270 +0,0 @@
|
|||||||
// host_update_test.go — covers POST /api/hosts/{id}/update.
|
|
||||||
package http
|
|
||||||
|
|
||||||
import (
|
|
||||||
"context"
|
|
||||||
"encoding/json"
|
|
||||||
"io"
|
|
||||||
stdhttp "net/http"
|
|
||||||
"strings"
|
|
||||||
"sync"
|
|
||||||
"testing"
|
|
||||||
"time"
|
|
||||||
|
|
||||||
"github.com/coder/websocket"
|
|
||||||
"github.com/oklog/ulid/v2"
|
|
||||||
|
|
||||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/api"
|
|
||||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
|
|
||||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/version"
|
|
||||||
)
|
|
||||||
|
|
||||||
// stubWatcher is a test double that remembers every host ID passed to
// Track, so tests can assert that the watcher was notified.
type stubWatcher struct {
	mu      sync.Mutex
	tracked []string // host IDs, in call order
}

// Track appends hostID to the recorded list under the mutex; the job ID
// argument is ignored.
func (s *stubWatcher) Track(_, hostID string) {
	s.mu.Lock()
	s.tracked = append(s.tracked, hostID)
	s.mu.Unlock()
}
|
|
||||||
|
|
||||||
func TestHostUpdateHappyPath(t *testing.T) {
|
|
||||||
t.Parallel()
|
|
||||||
srv, ts, st := rawTestServer(t)
|
|
||||||
watcher := &stubWatcher{}
|
|
||||||
srv.deps.UpdateWatcher = watcher
|
|
||||||
hostID, token := enrolHostForWS(t, srv, st, "upd-host")
|
|
||||||
c := agentDial(t, srv, ts, hostID, token)
|
|
||||||
sendHello(t, c, "upd-host")
|
|
||||||
_ = drainUntil(t, c, api.MsgScheduleSet)
|
|
||||||
|
|
||||||
// Force a version mismatch so the dispatch isn't short-circuited.
|
|
||||||
if err := st.MarkHostHello(context.Background(), hostID, "v0", "0.17", api.CurrentProtocolVersion, time.Now().UTC()); err != nil {
|
|
||||||
t.Fatalf("mark hello: %v", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
cookie := loginAsAdmin(t, st)
|
|
||||||
req, _ := stdhttp.NewRequest("POST", ts.URL+"/api/hosts/"+hostID+"/update", nil)
|
|
||||||
req.AddCookie(cookie)
|
|
||||||
res, err := stdhttp.DefaultClient.Do(req)
|
|
||||||
if err != nil {
|
|
||||||
t.Fatalf("do: %v", err)
|
|
||||||
}
|
|
||||||
defer res.Body.Close()
|
|
||||||
if res.StatusCode != stdhttp.StatusAccepted {
|
|
||||||
t.Fatalf("status: got %d, want 202", res.StatusCode)
|
|
||||||
}
|
|
||||||
var out struct {
|
|
||||||
JobID string `json:"job_id"`
|
|
||||||
}
|
|
||||||
if err := json.NewDecoder(res.Body).Decode(&out); err != nil {
|
|
||||||
t.Fatalf("decode: %v", err)
|
|
||||||
}
|
|
||||||
if out.JobID == "" {
|
|
||||||
t.Fatal("missing job_id in response")
|
|
||||||
}
|
|
||||||
|
|
||||||
// command.update envelope arrives.
|
|
||||||
deadline := time.Now().Add(2 * time.Second)
|
|
||||||
var got api.Envelope
|
|
||||||
for time.Now().Before(deadline) {
|
|
||||||
ctx, cancel := context.WithTimeout(context.Background(), 500*time.Millisecond)
|
|
||||||
mt, raw, rerr := c.Read(ctx)
|
|
||||||
cancel()
|
|
||||||
if rerr != nil {
|
|
||||||
break
|
|
||||||
}
|
|
||||||
if mt != websocket.MessageText {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
if !strings.Contains(string(raw), `"command.update"`) {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
_ = json.Unmarshal(raw, &got)
|
|
||||||
break
|
|
||||||
}
|
|
||||||
if got.Type != api.MsgCommandUpdate {
|
|
||||||
t.Fatal("never received command.update envelope")
|
|
||||||
}
|
|
||||||
var cp api.CommandUpdatePayload
|
|
||||||
if err := got.UnmarshalPayload(&cp); err != nil {
|
|
||||||
t.Fatalf("payload: %v", err)
|
|
||||||
}
|
|
||||||
if cp.JobID != out.JobID {
|
|
||||||
t.Fatalf("payload job_id: got %q want %q", cp.JobID, out.JobID)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Watcher tracked.
|
|
||||||
watcher.mu.Lock()
|
|
||||||
defer watcher.mu.Unlock()
|
|
||||||
if len(watcher.tracked) != 1 || watcher.tracked[0] != hostID {
|
|
||||||
t.Fatalf("watcher tracked: %v", watcher.tracked)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Audit row exists.
|
|
||||||
var n int
|
|
||||||
if err := st.DB().QueryRow(
|
|
||||||
`SELECT COUNT(*) FROM audit_log WHERE action = 'host.update_dispatched' AND target_id = ?`,
|
|
||||||
hostID).Scan(&n); err != nil {
|
|
||||||
t.Fatalf("audit count: %v", err)
|
|
||||||
}
|
|
||||||
if n != 1 {
|
|
||||||
t.Fatalf("audit rows: got %d, want 1", n)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestHostUpdateNotFound(t *testing.T) {
|
|
||||||
t.Parallel()
|
|
||||||
_, ts, st := rawTestServer(t)
|
|
||||||
cookie := loginAsAdmin(t, st)
|
|
||||||
req, _ := stdhttp.NewRequest("POST", ts.URL+"/api/hosts/no-such/update", nil)
|
|
||||||
req.AddCookie(cookie)
|
|
||||||
res, err := stdhttp.DefaultClient.Do(req)
|
|
||||||
if err != nil {
|
|
||||||
t.Fatalf("do: %v", err)
|
|
||||||
}
|
|
||||||
defer res.Body.Close()
|
|
||||||
if res.StatusCode != stdhttp.StatusNotFound {
|
|
||||||
t.Fatalf("status: got %d want 404", res.StatusCode)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestHostUpdateOffline(t *testing.T) {
|
|
||||||
t.Parallel()
|
|
||||||
_, ts, st := rawTestServer(t)
|
|
||||||
hostID := ulid.Make().String()
|
|
||||||
if err := st.CreateHost(context.Background(), store.Host{
|
|
||||||
ID: hostID, Name: "off", OS: "linux", Arch: "amd64",
|
|
||||||
EnrolledAt: time.Now().UTC(),
|
|
||||||
}, "deadbeef", ""); err != nil {
|
|
||||||
t.Fatalf("create: %v", err)
|
|
||||||
}
|
|
||||||
cookie := loginAsAdmin(t, st)
|
|
||||||
req, _ := stdhttp.NewRequest("POST", ts.URL+"/api/hosts/"+hostID+"/update", nil)
|
|
||||||
req.AddCookie(cookie)
|
|
||||||
res, err := stdhttp.DefaultClient.Do(req)
|
|
||||||
if err != nil {
|
|
||||||
t.Fatalf("do: %v", err)
|
|
||||||
}
|
|
||||||
defer res.Body.Close()
|
|
||||||
if res.StatusCode != stdhttp.StatusConflict {
|
|
||||||
t.Fatalf("status: got %d want 409", res.StatusCode)
|
|
||||||
}
|
|
||||||
body := readJSONError(t, res.Body)
|
|
||||||
if body.Code != "host_offline" {
|
|
||||||
t.Fatalf("code: %q", body.Code)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestHostUpdateAlreadyUpToDate(t *testing.T) {
|
|
||||||
t.Parallel()
|
|
||||||
srv, ts, st := rawTestServer(t)
|
|
||||||
hostID, token := enrolHostForWS(t, srv, st, "uptodate-host")
|
|
||||||
c := agentDial(t, srv, ts, hostID, token)
|
|
||||||
sendHello(t, c, "uptodate-host")
|
|
||||||
_ = drainUntil(t, c, api.MsgScheduleSet)
|
|
||||||
|
|
||||||
// Force agent_version == version.Version.
|
|
||||||
if err := st.MarkHostHello(context.Background(), hostID, version.Version, "0.17", api.CurrentProtocolVersion, time.Now().UTC()); err != nil {
|
|
||||||
t.Fatalf("mark hello: %v", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
cookie := loginAsAdmin(t, st)
|
|
||||||
req, _ := stdhttp.NewRequest("POST", ts.URL+"/api/hosts/"+hostID+"/update", nil)
|
|
||||||
req.AddCookie(cookie)
|
|
||||||
res, err := stdhttp.DefaultClient.Do(req)
|
|
||||||
if err != nil {
|
|
||||||
t.Fatalf("do: %v", err)
|
|
||||||
}
|
|
||||||
defer res.Body.Close()
|
|
||||||
if res.StatusCode != stdhttp.StatusConflict {
|
|
||||||
t.Fatalf("status: got %d want 409", res.StatusCode)
|
|
||||||
}
|
|
||||||
body := readJSONError(t, res.Body)
|
|
||||||
if body.Code != "already_up_to_date" {
|
|
||||||
t.Fatalf("code: %q", body.Code)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestHostUpdateInProgress(t *testing.T) {
|
|
||||||
t.Parallel()
|
|
||||||
srv, ts, st := rawTestServer(t)
|
|
||||||
hostID, token := enrolHostForWS(t, srv, st, "inprog-host")
|
|
||||||
c := agentDial(t, srv, ts, hostID, token)
|
|
||||||
sendHello(t, c, "inprog-host")
|
|
||||||
_ = drainUntil(t, c, api.MsgScheduleSet)
|
|
||||||
if err := st.MarkHostHello(context.Background(), hostID, "v0", "0.17", api.CurrentProtocolVersion, time.Now().UTC()); err != nil {
|
|
||||||
t.Fatalf("mark hello: %v", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Pre-seed an in-flight update job.
|
|
||||||
jobID := ulid.Make().String()
|
|
||||||
if err := st.CreateJob(context.Background(), store.Job{
|
|
||||||
ID: jobID, HostID: hostID, Kind: "update",
|
|
||||||
ActorKind: "user", CreatedAt: time.Now().UTC(),
|
|
||||||
}); err != nil {
|
|
||||||
t.Fatalf("seed job: %v", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
cookie := loginAsAdmin(t, st)
|
|
||||||
req, _ := stdhttp.NewRequest("POST", ts.URL+"/api/hosts/"+hostID+"/update", nil)
|
|
||||||
req.AddCookie(cookie)
|
|
||||||
res, err := stdhttp.DefaultClient.Do(req)
|
|
||||||
if err != nil {
|
|
||||||
t.Fatalf("do: %v", err)
|
|
||||||
}
|
|
||||||
defer res.Body.Close()
|
|
||||||
if res.StatusCode != stdhttp.StatusConflict {
|
|
||||||
t.Fatalf("status: got %d want 409", res.StatusCode)
|
|
||||||
}
|
|
||||||
body := readJSONError(t, res.Body)
|
|
||||||
if body.Code != "update_in_progress" {
|
|
||||||
t.Fatalf("code: %q", body.Code)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestHostUpdateRBAC(t *testing.T) {
|
|
||||||
t.Parallel()
|
|
||||||
_, ts, st := rawTestServer(t)
|
|
||||||
hostID := ulid.Make().String()
|
|
||||||
if err := st.CreateHost(context.Background(), store.Host{
|
|
||||||
ID: hostID, Name: "rbac-host", OS: "linux", Arch: "amd64",
|
|
||||||
EnrolledAt: time.Now().UTC(),
|
|
||||||
}, "deadbeef", ""); err != nil {
|
|
||||||
t.Fatalf("create: %v", err)
|
|
||||||
}
|
|
||||||
for _, role := range []store.Role{store.RoleViewer, store.RoleOperator} {
|
|
||||||
role := role
|
|
||||||
t.Run(string(role), func(t *testing.T) {
|
|
||||||
cookie := loginAsRole(t, st, role)
|
|
||||||
req, _ := stdhttp.NewRequest("POST", ts.URL+"/api/hosts/"+hostID+"/update", nil)
|
|
||||||
req.AddCookie(cookie)
|
|
||||||
res, err := stdhttp.DefaultClient.Do(req)
|
|
||||||
if err != nil {
|
|
||||||
t.Fatalf("do: %v", err)
|
|
||||||
}
|
|
||||||
defer res.Body.Close()
|
|
||||||
if res.StatusCode != stdhttp.StatusForbidden {
|
|
||||||
t.Fatalf("status for %s: got %d want 403", role, res.StatusCode)
|
|
||||||
}
|
|
||||||
})
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// jsonErrBody is the shape these tests decode JSON error responses into.
type jsonErrBody struct {
	Code    string `json:"code"`
	Message string `json:"message,omitempty"`
}

// readJSONError decodes one JSON value from body into a jsonErrBody and
// fails the calling test immediately when decoding fails.
func readJSONError(t *testing.T, body io.Reader) jsonErrBody {
	t.Helper()

	parsed := jsonErrBody{}
	decoder := json.NewDecoder(body)
	if decodeErr := decoder.Decode(&parsed); decodeErr != nil {
		t.Fatalf("decode error body: %v", decodeErr)
	}
	return parsed
}
|
|
||||||
@@ -4,7 +4,6 @@ import (
|
|||||||
stdhttp "net/http"
|
stdhttp "net/http"
|
||||||
|
|
||||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
|
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
|
||||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/version"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
// hostView is the JSON projection of a Host row. Same shape as the
|
// hostView is the JSON projection of a Host row. Same shape as the
|
||||||
@@ -28,8 +27,6 @@ type hostView struct {
|
|||||||
RepoSizeBytes int64 `json:"repo_size_bytes"`
|
RepoSizeBytes int64 `json:"repo_size_bytes"`
|
||||||
SnapshotCount int `json:"snapshot_count"`
|
SnapshotCount int `json:"snapshot_count"`
|
||||||
OpenAlertCount int `json:"open_alert_count"`
|
OpenAlertCount int `json:"open_alert_count"`
|
||||||
UpdateAvailable bool `json:"update_available"`
|
|
||||||
TargetVersion string `json:"target_version,omitempty"`
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// handleListHosts returns the full fleet as JSON. Authenticated; the
|
// handleListHosts returns the full fleet as JSON. Authenticated; the
|
||||||
@@ -88,8 +85,6 @@ func hostToView(h store.Host) hostView {
|
|||||||
RepoSizeBytes: h.RepoSizeBytes,
|
RepoSizeBytes: h.RepoSizeBytes,
|
||||||
SnapshotCount: h.SnapshotCount,
|
SnapshotCount: h.SnapshotCount,
|
||||||
OpenAlertCount: h.OpenAlertCount,
|
OpenAlertCount: h.OpenAlertCount,
|
||||||
TargetVersion: version.Version,
|
|
||||||
UpdateAvailable: h.AgentVersion != "" && h.AgentVersion != version.Version,
|
|
||||||
}
|
}
|
||||||
if v.Tags == nil {
|
if v.Tags == nil {
|
||||||
v.Tags = []string{}
|
v.Tags = []string{}
|
||||||
|
|||||||
@@ -39,13 +39,6 @@ type Deps struct {
|
|||||||
// NotificationHub (optional, wired in G1) is used by the test-fire
|
// NotificationHub (optional, wired in G1) is used by the test-fire
|
||||||
// endpoint to dispatch a single synthetic payload through a channel.
|
// endpoint to dispatch a single synthetic payload through a channel.
|
||||||
NotificationHub *notification.Hub
|
NotificationHub *notification.Hub
|
||||||
// UpdateWatcher tracks in-flight agent self-update dispatches and
|
|
||||||
// reconciles them against incoming hello envelopes. Optional;
|
|
||||||
// nil = no-op (handlers degrade by skipping the Track call).
|
|
||||||
UpdateWatcher UpdateWatcher
|
|
||||||
// FleetWorker drives the rolling fleet-update worker. Optional;
|
|
||||||
// nil = fleet update endpoints (P6-15) report unavailable.
|
|
||||||
FleetWorker FleetWorker
|
|
||||||
// Version is the binary's build version, surfaced in the chrome.
|
// Version is the binary's build version, surfaced in the chrome.
|
||||||
// Empty falls back to "dev".
|
// Empty falls back to "dev".
|
||||||
Version string
|
Version string
|
||||||
@@ -130,9 +123,8 @@ func (s *Server) routes(r chi.Router) {
|
|||||||
r.Post("/api/agents/announce", s.handleAnnounce)
|
r.Post("/api/agents/announce", s.handleAnnounce)
|
||||||
r.Get("/agent/binary", s.handleAgentBinary)
|
r.Get("/agent/binary", s.handleAgentBinary)
|
||||||
r.Get("/install/*", s.handleInstallAsset)
|
r.Get("/install/*", s.handleInstallAsset)
|
||||||
r.Get("/api/version", s.handleVersion)
|
|
||||||
if s.deps.Hub != nil {
|
if s.deps.Hub != nil {
|
||||||
hd := ws.HandlerDeps{
|
r.Mount("/ws/agent", ws.AgentHandler(ws.HandlerDeps{
|
||||||
Hub: s.deps.Hub,
|
Hub: s.deps.Hub,
|
||||||
Store: s.deps.Store,
|
Store: s.deps.Store,
|
||||||
JobHub: s.deps.JobHub,
|
JobHub: s.deps.JobHub,
|
||||||
@@ -140,11 +132,7 @@ func (s *Server) routes(r chi.Router) {
|
|||||||
OnHello: s.onAgentHello,
|
OnHello: s.onAgentHello,
|
||||||
OnScheduleAck: s.applyScheduleAck,
|
OnScheduleAck: s.applyScheduleAck,
|
||||||
OnScheduleFire: s.dispatchScheduledJob,
|
OnScheduleFire: s.dispatchScheduledJob,
|
||||||
}
|
}))
|
||||||
if w, ok := s.deps.UpdateWatcher.(*ws.UpdateWatcher); ok && w != nil {
|
|
||||||
hd.UpdateWatcher = w
|
|
||||||
}
|
|
||||||
r.Mount("/ws/agent", ws.AgentHandler(hd))
|
|
||||||
}
|
}
|
||||||
r.Get("/ws/agent/pending", s.handlePendingWS)
|
r.Get("/ws/agent/pending", s.handlePendingWS)
|
||||||
r.Mount("/static/", staticHandler())
|
r.Mount("/static/", staticHandler())
|
||||||
@@ -195,9 +183,7 @@ func (s *Server) routes(r chi.Router) {
|
|||||||
r.Get("/hosts/{id}/sources", s.handleUIHostSources)
|
r.Get("/hosts/{id}/sources", s.handleUIHostSources)
|
||||||
r.Get("/hosts/{id}/sources/new", s.handleUISourceGroupNewGet)
|
r.Get("/hosts/{id}/sources/new", s.handleUISourceGroupNewGet)
|
||||||
r.Get("/hosts/{id}/sources/{gid}/edit", s.handleUISourceGroupEditGet)
|
r.Get("/hosts/{id}/sources/{gid}/edit", s.handleUISourceGroupEditGet)
|
||||||
r.Get("/hosts/{id}/jobs", s.handleUIHostJobs)
|
|
||||||
r.Get("/hosts/{id}/repo", s.handleUIHostRepo)
|
r.Get("/hosts/{id}/repo", s.handleUIHostRepo)
|
||||||
r.Get("/hosts/{id}/repo/trend", s.handleUIRepoTrend)
|
|
||||||
r.Get("/hosts/{id}/schedules", s.handleUISchedulesList)
|
r.Get("/hosts/{id}/schedules", s.handleUISchedulesList)
|
||||||
r.Get("/hosts/{id}/schedules/new", s.handleUIScheduleNewGet)
|
r.Get("/hosts/{id}/schedules/new", s.handleUIScheduleNewGet)
|
||||||
r.Get("/hosts/{id}/schedules/{sid}/edit", s.handleUIScheduleEditGet)
|
r.Get("/hosts/{id}/schedules/{sid}/edit", s.handleUIScheduleEditGet)
|
||||||
@@ -284,14 +270,6 @@ func (s *Server) routes(r chi.Router) {
|
|||||||
r.Group(func(r chi.Router) {
|
r.Group(func(r chi.Router) {
|
||||||
r.Use(s.requireRole(store.RoleAdmin))
|
r.Use(s.requireRole(store.RoleAdmin))
|
||||||
|
|
||||||
r.Post("/api/hosts/{id}/update", s.handleHostUpdate)
|
|
||||||
r.Post("/hosts/{id}/update", s.handleHostUpdateForm)
|
|
||||||
|
|
||||||
// Fleet update (P6-15): rolling update across many hosts.
|
|
||||||
r.Post("/api/fleet/update", s.handleAPIFleetUpdateStart)
|
|
||||||
r.Post("/api/fleet-updates/{id}/cancel", s.handleAPIFleetUpdateCancel)
|
|
||||||
r.Get("/api/fleet-updates/{id}", s.handleAPIFleetUpdateGet)
|
|
||||||
|
|
||||||
r.Get("/api/users", s.handleAPIUsersList)
|
r.Get("/api/users", s.handleAPIUsersList)
|
||||||
r.Post("/api/users", s.handleAPIUserCreate)
|
r.Post("/api/users", s.handleAPIUserCreate)
|
||||||
r.Get("/api/users/{id}", s.handleAPIUserGet)
|
r.Get("/api/users/{id}", s.handleAPIUserGet)
|
||||||
@@ -305,8 +283,6 @@ func (s *Server) routes(r chi.Router) {
|
|||||||
if s.deps.UI != nil {
|
if s.deps.UI != nil {
|
||||||
r.Post("/hosts/{id}/delete", s.handleUIHostDelete)
|
r.Post("/hosts/{id}/delete", s.handleUIHostDelete)
|
||||||
r.Get("/settings", s.handleUISettings)
|
r.Get("/settings", s.handleUISettings)
|
||||||
r.Get("/settings/fleet-update", s.handleUIFleetUpdate)
|
|
||||||
r.Get("/settings/fleet-update/partial", s.handleUIFleetUpdatePartial)
|
|
||||||
r.Get("/settings/users", s.handleUIUsersList)
|
r.Get("/settings/users", s.handleUIUsersList)
|
||||||
r.Get("/settings/users/new", s.handleUIUserNewGet)
|
r.Get("/settings/users/new", s.handleUIUserNewGet)
|
||||||
r.Post("/settings/users/new", s.handleUIUserNewPost)
|
r.Post("/settings/users/new", s.handleUIUserNewPost)
|
||||||
@@ -345,27 +321,6 @@ func (s *Server) Shutdown(ctx context.Context) error {
|
|||||||
return s.srv.Shutdown(ctx)
|
return s.srv.Shutdown(ctx)
|
||||||
}
|
}
|
||||||
|
|
||||||
// SetFleetWorker installs the fleet-update worker post-construction.
|
|
||||||
// Used to break the wiring loop in cmd/server (the worker depends on a
|
|
||||||
// dispatcher that delegates back into the server's host-update path).
|
|
||||||
func (s *Server) SetFleetWorker(fw FleetWorker) { s.deps.FleetWorker = fw }
|
|
||||||
|
|
||||||
// DispatchHostUpdate is the public entry point for callers (the fleet
|
|
||||||
// worker) that need to drive the same dispatch path the HTTP handler
|
|
||||||
// uses, without going through HTTP. Returns the structured result so
|
|
||||||
// the caller can map error codes to its own status enum.
|
|
||||||
func (s *Server) DispatchHostUpdate(ctx context.Context, hostID, actorUserID string) (jobID string, code string, err error) {
|
|
||||||
var actorID *string
|
|
||||||
if actorUserID != "" {
|
|
||||||
actorID = &actorUserID
|
|
||||||
}
|
|
||||||
res := s.dispatchHostUpdate(ctx, hostID, "user", actorID)
|
|
||||||
if res.Code != "" {
|
|
||||||
return res.JobID, res.Code, nil
|
|
||||||
}
|
|
||||||
return res.JobID, "", nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// Addr returns the configured listen address. Useful in tests when
|
// Addr returns the configured listen address. Useful in tests when
|
||||||
// the caller passes :0 to get a random port.
|
// the caller passes :0 to get a random port.
|
||||||
func (s *Server) Addr() string { return s.srv.Addr }
|
func (s *Server) Addr() string { return s.srv.Addr }
|
||||||
|
|||||||
@@ -1,83 +0,0 @@
|
|||||||
package http
|
|
||||||
|
|
||||||
import (
	"context"
	"io"
	stdhttp "net/http"
	"strings"
	"testing"
	"time"

	"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
)
|
|
||||||
|
|
||||||
// getDashboard issues GET baseURL+"/" with the given cookie attached and
// returns the response body as a string.
//
// Redirects are not followed (the last response is used as-is), so a
// redirect surfaces as a non-200 status. The test is failed immediately
// when the request cannot be built or sent, when the status is not 200,
// or when the body cannot be read in full.
func getDashboard(t *testing.T, baseURL string, cookie *stdhttp.Cookie) string {
	t.Helper()
	client := &stdhttp.Client{
		CheckRedirect: func(_ *stdhttp.Request, _ []*stdhttp.Request) error {
			return stdhttp.ErrUseLastResponse
		},
	}
	req, err := stdhttp.NewRequest("GET", baseURL+"/", nil)
	if err != nil {
		t.Fatalf("new request: %v", err)
	}
	req.AddCookie(cookie)
	res, err := client.Do(req)
	if err != nil {
		t.Fatalf("GET /: %v", err)
	}
	defer res.Body.Close()
	if res.StatusCode != stdhttp.StatusOK {
		t.Fatalf("GET /: want 200, got %d", res.StatusCode)
	}
	// A truncated body would make the callers' substring assertions fail
	// with a misleading message, so surface read errors here instead.
	body, err := io.ReadAll(res.Body)
	if err != nil {
		t.Fatalf("GET /: read body: %v", err)
	}
	return string(body)
}
|
|
||||||
|
|
||||||
func TestDashboard_HostRowSparklineRendersWithHistory(t *testing.T) {
|
|
||||||
t.Parallel()
|
|
||||||
_, baseURL, st := newTestServerWithUI(t)
|
|
||||||
cookie := loginAsAdmin(t, st)
|
|
||||||
hostID := makeHost(t, st, "h-spark")
|
|
||||||
ctx := context.Background()
|
|
||||||
|
|
||||||
// Two history points → polyline must render.
|
|
||||||
for i, day := range []string{"2026-05-05", "2026-05-06"} {
|
|
||||||
v := int64(100 + i*50)
|
|
||||||
if err := st.UpsertHostRepoStatsHistory(ctx, hostID, day,
|
|
||||||
store.HostRepoStats{TotalSizeBytes: &v}, time.Now().UTC()); err != nil {
|
|
||||||
t.Fatalf("upsert %s: %v", day, err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
body := getDashboard(t, baseURL, cookie)
|
|
||||||
if !strings.Contains(body, `class="repo-sparkline"`) {
|
|
||||||
t.Errorf("expected sparkline SVG in dashboard body (class=repo-sparkline missing)")
|
|
||||||
}
|
|
||||||
if !strings.Contains(body, `<polyline`) {
|
|
||||||
t.Errorf("expected <polyline> in dashboard body")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestDashboard_HostRowSparklineEmptyState(t *testing.T) {
|
|
||||||
t.Parallel()
|
|
||||||
_, baseURL, st := newTestServerWithUI(t)
|
|
||||||
cookie := loginAsAdmin(t, st)
|
|
||||||
makeHost(t, st, "h-empty")
|
|
||||||
|
|
||||||
body := getDashboard(t, baseURL, cookie)
|
|
||||||
if !strings.Contains(body, `class="repo-sparkline"`) {
|
|
||||||
t.Errorf("expected sparkline SVG element on dashboard")
|
|
||||||
}
|
|
||||||
if !strings.Contains(body, `>—<`) {
|
|
||||||
t.Errorf("expected em-dash placeholder in empty sparkline cell")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -5,10 +5,8 @@ import (
|
|||||||
"encoding/base64"
|
"encoding/base64"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"errors"
|
"errors"
|
||||||
"html/template"
|
|
||||||
"io/fs"
|
"io/fs"
|
||||||
"log/slog"
|
"log/slog"
|
||||||
"math"
|
|
||||||
stdhttp "net/http"
|
stdhttp "net/http"
|
||||||
"net/url"
|
"net/url"
|
||||||
"sort"
|
"sort"
|
||||||
@@ -25,8 +23,6 @@ import (
|
|||||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/ui"
|
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/ui"
|
||||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/ws"
|
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/ws"
|
||||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
|
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
|
||||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/version"
|
|
||||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/web/sparkline"
|
|
||||||
"gitea.dcglab.co.uk/steve/restic-manager/web"
|
"gitea.dcglab.co.uk/steve/restic-manager/web"
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -159,10 +155,6 @@ type dashboardPage struct {
|
|||||||
// when it's already active). Pre-computed so the template stays
|
// when it's already active). Pre-computed so the template stays
|
||||||
// dumb.
|
// dumb.
|
||||||
SortURL map[string]string
|
SortURL map[string]string
|
||||||
// UpdatesBehind is the count of online hosts whose agent_version
|
|
||||||
// trails the server. Surfaces as the dashboard "N hosts behind"
|
|
||||||
// hero tile and links to ?updates=behind.
|
|
||||||
UpdatesBehind int
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// dashboardFilter holds the parsed query-string filter state.
|
// dashboardFilter holds the parsed query-string filter state.
|
||||||
@@ -173,10 +165,6 @@ type dashboardFilter struct {
|
|||||||
Tag string // mirrors ActiveTag for round-trip on links
|
Tag string // mirrors ActiveTag for round-trip on links
|
||||||
Sort string // column key (see sortDashboard)
|
Sort string // column key (see sortDashboard)
|
||||||
Dir string // "asc" | "desc"
|
Dir string // "asc" | "desc"
|
||||||
// Updates narrows to hosts whose agent is behind the server's
|
|
||||||
// version. Only valid value today is "behind"; empty means no
|
|
||||||
// filter.
|
|
||||||
Updates string
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// dashboardHostRow carries a host plus the per-row Run-now decision
|
// dashboardHostRow carries a host plus the per-row Run-now decision
|
||||||
@@ -192,17 +180,6 @@ type dashboardHostRow struct {
|
|||||||
// NextRun is the next-fire time of RunAllScheduleID (when set),
|
// NextRun is the next-fire time of RunAllScheduleID (when set),
|
||||||
// computed server-side from its cron. nil otherwise.
|
// computed server-side from its cron. nil otherwise.
|
||||||
NextRun *time.Time
|
NextRun *time.Time
|
||||||
// UpdateAvailable is true when the host's agent has connected at
|
|
||||||
// least once AND its agent_version differs from the server's. Used
|
|
||||||
// by the host_row partial to render the update-available chip.
|
|
||||||
UpdateAvailable bool
|
|
||||||
// TargetVersion is the server's build version, surfaced in the
|
|
||||||
// chip's tooltip and label.
|
|
||||||
TargetVersion string
|
|
||||||
// RepoSparklineSVG is a server-rendered inline SVG showing the
|
|
||||||
// 30-day repo-size trend. Empty-state SVG (em-dash) is returned
|
|
||||||
// when no history rows exist for the host.
|
|
||||||
RepoSparklineSVG template.HTML
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// pickRunAllSchedule returns the ID of the single schedule whose
|
// pickRunAllSchedule returns the ID of the single schedule whose
|
||||||
@@ -278,11 +255,7 @@ func (s *Server) handleUIDashboard(w stdhttp.ResponseWriter, r *stdhttp.Request)
|
|||||||
// calls per host — fine at fleet sizes we care about.
|
// calls per host — fine at fleet sizes we care about.
|
||||||
rows := make([]dashboardHostRow, 0, len(hosts))
|
rows := make([]dashboardHostRow, 0, len(hosts))
|
||||||
for _, h := range hosts {
|
for _, h := range hosts {
|
||||||
row := dashboardHostRow{
|
row := dashboardHostRow{Host: h}
|
||||||
Host: h,
|
|
||||||
TargetVersion: version.Version,
|
|
||||||
UpdateAvailable: h.AgentVersion != "" && h.AgentVersion != version.Version,
|
|
||||||
}
|
|
||||||
groups, gerr := s.deps.Store.ListSourceGroupsByHost(r.Context(), h.ID)
|
groups, gerr := s.deps.Store.ListSourceGroupsByHost(r.Context(), h.ID)
|
||||||
if gerr != nil {
|
if gerr != nil {
|
||||||
slog.Warn("ui dashboard: list source groups", "host_id", h.ID, "err", gerr)
|
slog.Warn("ui dashboard: list source groups", "host_id", h.ID, "err", gerr)
|
||||||
@@ -303,20 +276,6 @@ func (s *Server) handleUIDashboard(w stdhttp.ResponseWriter, r *stdhttp.Request)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
since := time.Now().UTC().AddDate(0, 0, -30)
|
|
||||||
pts, herr := s.deps.Store.ListHostRepoStatsHistory(r.Context(), h.ID, since)
|
|
||||||
if herr != nil {
|
|
||||||
slog.Warn("ui dashboard: list repo history", "host_id", h.ID, "err", herr)
|
|
||||||
}
|
|
||||||
sparkPoints := make([]float64, len(pts))
|
|
||||||
for i, p := range pts {
|
|
||||||
if p.TotalSizeBytes == nil {
|
|
||||||
sparkPoints[i] = math.NaN()
|
|
||||||
} else {
|
|
||||||
sparkPoints[i] = float64(*p.TotalSizeBytes)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
row.RepoSparklineSVG = sparkline.RenderSparkline(sparkPoints, 88, 20)
|
|
||||||
rows = append(rows, row)
|
rows = append(rows, row)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -330,13 +289,6 @@ func (s *Server) handleUIDashboard(w stdhttp.ResponseWriter, r *stdhttp.Request)
|
|||||||
critOpenCount = len(crit)
|
critOpenCount = len(crit)
|
||||||
}
|
}
|
||||||
|
|
||||||
updatesBehind := 0
|
|
||||||
for _, h := range allHosts {
|
|
||||||
if h.Status == "online" && h.AgentVersion != "" && h.AgentVersion != version.Version {
|
|
||||||
updatesBehind++
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
view := s.baseView(r, u)
|
view := s.baseView(r, u)
|
||||||
view.Page = dashboardPage{
|
view.Page = dashboardPage{
|
||||||
Hosts: rows,
|
Hosts: rows,
|
||||||
@@ -350,7 +302,6 @@ func (s *Server) handleUIDashboard(w stdhttp.ResponseWriter, r *stdhttp.Request)
|
|||||||
Filter: filter,
|
Filter: filter,
|
||||||
RefreshURL: "/?" + filter.encode(),
|
RefreshURL: "/?" + filter.encode(),
|
||||||
SortURL: buildDashboardSortURLs(filter),
|
SortURL: buildDashboardSortURLs(filter),
|
||||||
UpdatesBehind: updatesBehind,
|
|
||||||
}
|
}
|
||||||
if err := s.deps.UI.Render(w, "dashboard", view); err != nil {
|
if err := s.deps.UI.Render(w, "dashboard", view); err != nil {
|
||||||
slog.Error("ui: render dashboard", "err", err)
|
slog.Error("ui: render dashboard", "err", err)
|
||||||
@@ -369,7 +320,6 @@ func parseDashboardFilter(q url.Values) dashboardFilter {
|
|||||||
Tag: q.Get("tag"),
|
Tag: q.Get("tag"),
|
||||||
Sort: q.Get("sort"),
|
Sort: q.Get("sort"),
|
||||||
Dir: q.Get("dir"),
|
Dir: q.Get("dir"),
|
||||||
Updates: q.Get("updates"),
|
|
||||||
}
|
}
|
||||||
if f.Sort == "" {
|
if f.Sort == "" {
|
||||||
f.Sort = "name"
|
f.Sort = "name"
|
||||||
@@ -402,9 +352,6 @@ func (f dashboardFilter) encode() string {
|
|||||||
if f.Dir != "" && f.Dir != "asc" {
|
if f.Dir != "" && f.Dir != "asc" {
|
||||||
v.Set("dir", f.Dir)
|
v.Set("dir", f.Dir)
|
||||||
}
|
}
|
||||||
if f.Updates != "" {
|
|
||||||
v.Set("updates", f.Updates)
|
|
||||||
}
|
|
||||||
return v.Encode()
|
return v.Encode()
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -455,11 +402,6 @@ func filterAndSortDashboardHosts(hosts []store.Host, f dashboardFilter) []store.
|
|||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if f.Updates == "behind" {
|
|
||||||
if h.AgentVersion == "" || h.AgentVersion == version.Version {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
}
|
|
||||||
out = append(out, h)
|
out = append(out, h)
|
||||||
}
|
}
|
||||||
sortDashboardHosts(out, f.Sort, f.Dir)
|
sortDashboardHosts(out, f.Sort, f.Dir)
|
||||||
@@ -867,20 +809,6 @@ type hostChromeData struct {
|
|||||||
SourceGroupCount int
|
SourceGroupCount int
|
||||||
ScheduleCount int
|
ScheduleCount int
|
||||||
ScheduleVersion int64 // host_schedule_version (latest desired)
|
ScheduleVersion int64 // host_schedule_version (latest desired)
|
||||||
// UpdateAvailable + TargetVersion drive the agent-out-of-date chip
|
|
||||||
// in the host detail header. UpdateAvailable is true iff the host
|
|
||||||
// has connected at least once AND its agent_version != server's.
|
|
||||||
UpdateAvailable bool
|
|
||||||
TargetVersion string
|
|
||||||
// Online + UpdateInProgress drive the per-host "Update agent"
|
|
||||||
// button on host_detail. Online mirrors hub.Connected; pulled here
|
|
||||||
// so the button can disable when the host is unreachable.
|
|
||||||
Online bool
|
|
||||||
UpdateInProgress bool
|
|
||||||
// CanAdmin is true when the viewing user has admin role; used to
|
|
||||||
// gate the "Update agent" button. Kept on the chrome struct so any
|
|
||||||
// page reusing host_chrome already has it for free.
|
|
||||||
CanAdmin bool
|
|
||||||
// KnownTags is the union of tags already in use across the fleet,
|
// KnownTags is the union of tags already in use across the fleet,
|
||||||
// used for autocomplete on the host-tags edit form. Cheap query.
|
// used for autocomplete on the host-tags edit form. Cheap query.
|
||||||
KnownTags []string
|
KnownTags []string
|
||||||
@@ -906,14 +834,6 @@ type hostChromeData struct {
|
|||||||
// render the page with stale counts than 500 the whole tab.
|
// render the page with stale counts than 500 the whole tab.
|
||||||
func (s *Server) loadHostChrome(r *stdhttp.Request, host store.Host, subtab, crumb string) hostChromeData {
|
func (s *Server) loadHostChrome(r *stdhttp.Request, host store.Host, subtab, crumb string) hostChromeData {
|
||||||
d := hostChromeData{Host: host, SubTab: subtab, Crumb: crumb}
|
d := hostChromeData{Host: host, SubTab: subtab, Crumb: crumb}
|
||||||
d.TargetVersion = version.Version
|
|
||||||
d.UpdateAvailable = host.AgentVersion != "" && host.AgentVersion != version.Version
|
|
||||||
if s.deps.Hub != nil {
|
|
||||||
d.Online = s.deps.Hub.Connected(host.ID)
|
|
||||||
}
|
|
||||||
if existing, _ := s.deps.Store.RunningUpdateJobForHost(r.Context(), host.ID); existing != "" {
|
|
||||||
d.UpdateInProgress = true
|
|
||||||
}
|
|
||||||
if groups, err := s.deps.Store.ListSourceGroupsByHost(r.Context(), host.ID); err == nil {
|
if groups, err := s.deps.Store.ListSourceGroupsByHost(r.Context(), host.ID); err == nil {
|
||||||
d.SourceGroupCount = len(groups)
|
d.SourceGroupCount = len(groups)
|
||||||
} else {
|
} else {
|
||||||
@@ -1052,10 +972,8 @@ func (s *Server) handleUIHostDetail(w stdhttp.ResponseWriter, r *stdhttp.Request
|
|||||||
|
|
||||||
view := s.baseView(r, u)
|
view := s.baseView(r, u)
|
||||||
view.Title = host.Name + " · restic-manager"
|
view.Title = host.Name + " · restic-manager"
|
||||||
chrome := s.loadHostChrome(r, *host, "snapshots", "snapshots")
|
|
||||||
chrome.CanAdmin = u.Role == string(store.RoleAdmin)
|
|
||||||
view.Page = hostDetailPage{
|
view.Page = hostDetailPage{
|
||||||
hostChromeData: chrome,
|
hostChromeData: s.loadHostChrome(r, *host, "snapshots", "snapshots"),
|
||||||
Snapshots: shown,
|
Snapshots: shown,
|
||||||
SnapshotsShown: len(shown),
|
SnapshotsShown: len(shown),
|
||||||
LegacyRestic: !restic.Env{Version: host.ResticVersion}.AtLeastVersion(0, 17),
|
LegacyRestic: !restic.Env{Version: host.ResticVersion}.AtLeastVersion(0, 17),
|
||||||
|
|||||||
@@ -1,47 +0,0 @@
|
|||||||
package http
|
|
||||||
|
|
||||||
import (
|
|
||||||
"log/slog"
|
|
||||||
stdhttp "net/http"
|
|
||||||
|
|
||||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
|
|
||||||
)
|
|
||||||
|
|
||||||
// hostJobsPage is the page-data struct for /hosts/{id}/jobs.
|
|
||||||
type hostJobsPage struct {
|
|
||||||
hostChromeData
|
|
||||||
Jobs []store.Job
|
|
||||||
}
|
|
||||||
|
|
||||||
// handleUIHostJobs renders the per-host jobs list. Read-only — no
|
|
||||||
// actions, just a click-through to the existing /jobs/{id} detail
|
|
||||||
// page for any row.
|
|
||||||
func (s *Server) handleUIHostJobs(w stdhttp.ResponseWriter, r *stdhttp.Request) {
|
|
||||||
u := s.requireUIUser(w, r)
|
|
||||||
if u == nil {
|
|
||||||
return
|
|
||||||
}
|
|
||||||
host, ok := s.loadHostForUI(w, r)
|
|
||||||
if !ok {
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
jobs, err := s.deps.Store.ListJobsByHost(r.Context(), host.ID, 100)
|
|
||||||
if err != nil {
|
|
||||||
slog.Error("ui host jobs: list", "host_id", host.ID, "err", err)
|
|
||||||
stdhttp.Error(w, "internal", stdhttp.StatusInternalServerError)
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
page := hostJobsPage{
|
|
||||||
hostChromeData: s.loadHostChrome(r, *host, "jobs", "jobs"),
|
|
||||||
Jobs: jobs,
|
|
||||||
}
|
|
||||||
view := s.baseView(r, u)
|
|
||||||
view.Title = host.Name + " jobs · restic-manager"
|
|
||||||
view.Page = page
|
|
||||||
if err := s.deps.UI.Render(w, "host_jobs", view); err != nil {
|
|
||||||
slog.Error("ui: render host_jobs", "err", err)
|
|
||||||
stdhttp.Error(w, "internal", stdhttp.StatusInternalServerError)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -1,85 +0,0 @@
|
|||||||
package http
|
|
||||||
|
|
||||||
import (
|
|
||||||
"context"
|
|
||||||
"io"
|
|
||||||
stdhttp "net/http"
|
|
||||||
"strings"
|
|
||||||
"testing"
|
|
||||||
"time"
|
|
||||||
|
|
||||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
|
|
||||||
)
|
|
||||||
|
|
||||||
func TestUIHostJobs_RendersList(t *testing.T) {
|
|
||||||
t.Parallel()
|
|
||||||
_, baseURL, st := newTestServerWithUI(t)
|
|
||||||
cookie := loginAsAdmin(t, st)
|
|
||||||
hostID := makeHost(t, st, "h-jobs-render")
|
|
||||||
|
|
||||||
// Two jobs with distinct kinds + statuses.
|
|
||||||
now := time.Now().UTC()
|
|
||||||
ctx := context.Background()
|
|
||||||
if err := st.CreateJob(ctx, store.Job{
|
|
||||||
ID: "01HZZZZZZZZZZZZZZZZZZZZZ10", HostID: hostID, Kind: "backup",
|
|
||||||
ActorKind: "user", CreatedAt: now.Add(-time.Hour),
|
|
||||||
}); err != nil {
|
|
||||||
t.Fatalf("create job: %v", err)
|
|
||||||
}
|
|
||||||
if err := st.MarkJobFinished(ctx, "01HZZZZZZZZZZZZZZZZZZZZZ10", "succeeded", 0, nil, "", now.Add(-time.Hour+time.Minute)); err != nil {
|
|
||||||
t.Fatalf("finish job: %v", err)
|
|
||||||
}
|
|
||||||
if err := st.CreateJob(ctx, store.Job{
|
|
||||||
ID: "01HZZZZZZZZZZZZZZZZZZZZZ11", HostID: hostID, Kind: "prune",
|
|
||||||
ActorKind: "schedule", CreatedAt: now,
|
|
||||||
}); err != nil {
|
|
||||||
t.Fatalf("create job: %v", err)
|
|
||||||
}
|
|
||||||
if err := st.MarkJobFinished(ctx, "01HZZZZZZZZZZZZZZZZZZZZZ11", "failed", 1, nil, "boom", now.Add(time.Minute)); err != nil {
|
|
||||||
t.Fatalf("finish job: %v", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
body := getHostJobsPage(t, baseURL, hostID, cookie)
|
|
||||||
for _, want := range []string{"backup", "prune", "succeeded", "failed", "schedule", "user", `class="jobs-row`} {
|
|
||||||
if !strings.Contains(body, want) {
|
|
||||||
t.Errorf("expected %q in body, missing", want)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestUIHostJobs_EmptyState(t *testing.T) {
|
|
||||||
t.Parallel()
|
|
||||||
_, baseURL, st := newTestServerWithUI(t)
|
|
||||||
cookie := loginAsAdmin(t, st)
|
|
||||||
hostID := makeHost(t, st, "h-jobs-empty")
|
|
||||||
|
|
||||||
body := getHostJobsPage(t, baseURL, hostID, cookie)
|
|
||||||
if !strings.Contains(body, "No jobs yet.") {
|
|
||||||
t.Error("expected empty-state heading")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// getHostJobsPage fetches /hosts/{id}/jobs with the given cookie attached
// and returns the body string.
//
// Redirects are not followed (the last response is used as-is), so a
// redirect surfaces as a non-200 status. The test is failed immediately
// when the request cannot be built or sent, when the status is not 200,
// or when reading the body fails.
func getHostJobsPage(t *testing.T, baseURL, hostID string, cookie *stdhttp.Cookie) string {
	t.Helper()
	client := &stdhttp.Client{
		CheckRedirect: func(_ *stdhttp.Request, _ []*stdhttp.Request) error {
			return stdhttp.ErrUseLastResponse
		},
	}
	req, err := stdhttp.NewRequest("GET", baseURL+"/hosts/"+hostID+"/jobs", nil)
	if err != nil {
		t.Fatalf("new request: %v", err)
	}
	req.AddCookie(cookie)
	res, err := client.Do(req)
	if err != nil {
		t.Fatalf("GET /hosts/%s/jobs: %v", hostID, err)
	}
	defer res.Body.Close()
	if res.StatusCode != stdhttp.StatusOK {
		t.Fatalf("GET /hosts/%s/jobs: want 200, got %d", hostID, res.StatusCode)
	}
	// Do not hand a partial body back to the caller: its substring
	// assertions would then fail for the wrong reason.
	raw, err := io.ReadAll(res.Body)
	if err != nil {
		t.Fatalf("GET /hosts/%s/jobs: read body: %v", hostID, err)
	}
	return string(raw)
}
|
|
||||||
@@ -1,12 +1,9 @@
|
|||||||
package http
|
package http
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"context"
|
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"errors"
|
"errors"
|
||||||
"html/template"
|
|
||||||
"log/slog"
|
"log/slog"
|
||||||
"math"
|
|
||||||
stdhttp "net/http"
|
stdhttp "net/http"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
@@ -16,7 +13,6 @@ import (
|
|||||||
|
|
||||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/ui"
|
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/ui"
|
||||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
|
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
|
||||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/web/sparkline"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
// ui_repo.go — HTML form-driven repo-tab handlers (connection,
|
// ui_repo.go — HTML form-driven repo-tab handlers (connection,
|
||||||
@@ -31,15 +27,6 @@ import (
|
|||||||
// POST /hosts/{id}/admin-credentials — admin (prune) creds
|
// POST /hosts/{id}/admin-credentials — admin (prune) creds
|
||||||
// POST /hosts/{id}/admin-credentials/delete — clear admin creds
|
// POST /hosts/{id}/admin-credentials/delete — clear admin creds
|
||||||
|
|
||||||
// repoTrendView is the template data for the repo_size_chart partial.
type repoTrendView struct {
	// HostID and Range round-trip through the htmx range pills.
	HostID string
	Range  string

	// ChartSVG is rendered server-side ahead of time, so the partial only
	// has to wrap it.
	ChartSVG template.HTML
}
|
|
||||||
|
|
||||||
// repoStatsView is a flat, pre-dereferenced projection of
|
// repoStatsView is a flat, pre-dereferenced projection of
|
||||||
// store.HostRepoStats for use in templates. Nil pointer fields are
|
// store.HostRepoStats for use in templates. Nil pointer fields are
|
||||||
// collapsed to zero/false and accompanied by a Has* sentinel so the
|
// collapsed to zero/false and accompanied by a Has* sentinel so the
|
||||||
@@ -87,10 +74,6 @@ type hostRepoPage struct {
|
|||||||
// Nil when no row exists yet (fresh hosts).
|
// Nil when no row exists yet (fresh hosts).
|
||||||
StatsView *repoStatsView
|
StatsView *repoStatsView
|
||||||
|
|
||||||
// Trend holds the pre-rendered chart fragment data for the
|
|
||||||
// 30/90/365-day repo-size + snapshot-count overlay chart.
|
|
||||||
Trend repoTrendView
|
|
||||||
|
|
||||||
// Snapshots-by-tag — map[group_name]count, plus an "untagged" row.
|
// Snapshots-by-tag — map[group_name]count, plus an "untagged" row.
|
||||||
SnapshotsByTag map[string]int
|
SnapshotsByTag map[string]int
|
||||||
UntaggedSnapshots int
|
UntaggedSnapshots int
|
||||||
@@ -242,52 +225,9 @@ func (s *Server) loadHostRepoPage(r *stdhttp.Request, host store.Host) (*hostRep
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
p.Trend = s.buildRepoTrendView(r.Context(), host.ID, "30d")
|
|
||||||
|
|
||||||
return p, nil
|
return p, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// buildRepoTrendView builds the chart-partial data for a host. Used
|
|
||||||
// both by the page-load (initial 30d render) and the htmx fragment
|
|
||||||
// endpoint (range switching). An invalid rangeKey falls back to "30d".
|
|
||||||
func (s *Server) buildRepoTrendView(ctx context.Context, hostID, rangeKey string) repoTrendView {
|
|
||||||
days := 30
|
|
||||||
switch rangeKey {
|
|
||||||
case "90d":
|
|
||||||
days = 90
|
|
||||||
case "1y":
|
|
||||||
days = 365
|
|
||||||
default:
|
|
||||||
rangeKey = "30d"
|
|
||||||
}
|
|
||||||
since := time.Now().UTC().AddDate(0, 0, -days)
|
|
||||||
pts, err := s.deps.Store.ListHostRepoStatsHistory(ctx, hostID, since)
|
|
||||||
if err != nil {
|
|
||||||
slog.Warn("ui repo trend: list history", "host_id", hostID, "err", err)
|
|
||||||
}
|
|
||||||
sizes := make([]float64, len(pts))
|
|
||||||
counts := make([]float64, len(pts))
|
|
||||||
dayList := make([]time.Time, len(pts))
|
|
||||||
for i, p := range pts {
|
|
||||||
dayList[i] = p.Day
|
|
||||||
if p.TotalSizeBytes == nil {
|
|
||||||
sizes[i] = math.NaN()
|
|
||||||
} else {
|
|
||||||
sizes[i] = float64(*p.TotalSizeBytes)
|
|
||||||
}
|
|
||||||
if p.SnapshotCount == nil {
|
|
||||||
counts[i] = math.NaN()
|
|
||||||
} else {
|
|
||||||
counts[i] = float64(*p.SnapshotCount)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
chartSVG := sparkline.RenderChart([]sparkline.Series{
|
|
||||||
{Name: "size", Stroke: "#3b82f6", Axis: sparkline.AxisLeft, Format: sparkline.FormatBytes, Points: sizes},
|
|
||||||
{Name: "snapshots", Stroke: "#f59e0b", Axis: sparkline.AxisRight, Format: sparkline.FormatCount, Points: counts},
|
|
||||||
}, dayList, sparkline.ChartOpts{Width: 640, Height: 220})
|
|
||||||
return repoTrendView{HostID: hostID, Range: rangeKey, ChartSVG: chartSVG}
|
|
||||||
}
|
|
||||||
|
|
||||||
func (s *Server) handleUIHostRepo(w stdhttp.ResponseWriter, r *stdhttp.Request) {
|
func (s *Server) handleUIHostRepo(w stdhttp.ResponseWriter, r *stdhttp.Request) {
|
||||||
u := s.requireUIUser(w, r)
|
u := s.requireUIUser(w, r)
|
||||||
if u == nil {
|
if u == nil {
|
||||||
|
|||||||
@@ -1,25 +0,0 @@
|
|||||||
// ui_repo_trend.go — htmx fragment endpoint for the repo-page
|
|
||||||
// trend chart. Returns just the chart partial wrapped in
|
|
||||||
// <div id="repo-trend-chart"> so htmx can outerHTML-swap it.
|
|
||||||
//
|
|
||||||
// GET /hosts/{id}/repo/trend?range=30d|90d|1y
|
|
||||||
package http
|
|
||||||
|
|
||||||
import (
|
|
||||||
stdhttp "net/http"
|
|
||||||
|
|
||||||
"github.com/go-chi/chi/v5"
|
|
||||||
)
|
|
||||||
|
|
||||||
func (s *Server) handleUIRepoTrend(w stdhttp.ResponseWriter, r *stdhttp.Request) {
|
|
||||||
u := s.requireUIUser(w, r)
|
|
||||||
if u == nil {
|
|
||||||
return
|
|
||||||
}
|
|
||||||
hostID := chi.URLParam(r, "id")
|
|
||||||
view := s.baseView(r, u)
|
|
||||||
view.Page = s.buildRepoTrendView(r.Context(), hostID, r.URL.Query().Get("range"))
|
|
||||||
if err := s.deps.UI.RenderPartial(w, "repo_size_chart", view); err != nil {
|
|
||||||
stdhttp.Error(w, "internal", stdhttp.StatusInternalServerError)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -1,123 +0,0 @@
|
|||||||
package http
|
|
||||||
|
|
||||||
import (
|
|
||||||
"context"
|
|
||||||
stdhttp "net/http"
|
|
||||||
"strings"
|
|
||||||
"testing"
|
|
||||||
"time"
|
|
||||||
|
|
||||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
|
|
||||||
)
|
|
||||||
|
|
||||||
// getTrend fetches /hosts/{hostID}/repo/trend with the given cookie
// attached and returns the body as a string. A non-empty rangeKey is
// appended verbatim as ?range=<rangeKey>. Redirects are not followed. The
// test is failed immediately on a request error or a non-200 status.
//
// The body is read in 4 KiB chunks until the first read error of any
// kind; that error is not inspected.
func getTrend(t *testing.T, baseURL, hostID, rangeKey string, cookie *stdhttp.Cookie) string {
	t.Helper()

	target := baseURL + "/hosts/" + hostID + "/repo/trend"
	if rangeKey != "" {
		target += "?range=" + rangeKey
	}

	noRedirect := func(_ *stdhttp.Request, _ []*stdhttp.Request) error {
		return stdhttp.ErrUseLastResponse
	}
	client := &stdhttp.Client{CheckRedirect: noRedirect}

	req, err := stdhttp.NewRequest("GET", target, nil)
	if err != nil {
		t.Fatalf("new request: %v", err)
	}
	req.AddCookie(cookie)

	res, err := client.Do(req)
	if err != nil {
		t.Fatalf("GET %s: %v", target, err)
	}
	defer res.Body.Close()
	if got := res.StatusCode; got != stdhttp.StatusOK {
		t.Fatalf("GET %s: want 200, got %d", target, got)
	}

	var (
		collected = make([]byte, 0, 1<<20)
		chunk     = make([]byte, 4096)
	)
	for finished := false; !finished; {
		n, readErr := res.Body.Read(chunk)
		collected = append(collected, chunk[:n]...)
		finished = readErr != nil
	}
	return string(collected)
}
|
|
||||||
|
|
||||||
func TestUIRepoTrend_30dRange(t *testing.T) {
|
|
||||||
t.Parallel()
|
|
||||||
_, baseURL, st := newTestServerWithUI(t)
|
|
||||||
cookie := loginAsAdmin(t, st)
|
|
||||||
hostID := makeHost(t, st, "h-trend")
|
|
||||||
ctx := context.Background()
|
|
||||||
|
|
||||||
now := time.Now().UTC()
|
|
||||||
for i := 0; i < 5; i++ {
|
|
||||||
day := now.AddDate(0, 0, -i).Format("2006-01-02")
|
|
||||||
v := int64(1000 + i*100)
|
|
||||||
c := int64(10 + i)
|
|
||||||
if err := st.UpsertHostRepoStatsHistory(ctx, hostID, day,
|
|
||||||
store.HostRepoStats{TotalSizeBytes: &v, SnapshotCount: &c}, now); err != nil {
|
|
||||||
t.Fatalf("seed %s: %v", day, err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
body := getTrend(t, baseURL, hostID, "30d", cookie)
|
|
||||||
if !strings.Contains(body, `class="repo-trend-chart"`) {
|
|
||||||
t.Errorf("expected repo-trend-chart SVG in fragment")
|
|
||||||
}
|
|
||||||
if !strings.Contains(body, `id="repo-trend-chart"`) {
|
|
||||||
t.Errorf("expected outer wrapper id=repo-trend-chart")
|
|
||||||
}
|
|
||||||
if !strings.Contains(body, `data-range="30d"`) {
|
|
||||||
t.Errorf("expected data-range=30d")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestUIRepoTrend_InvalidRangeFallsBackTo30d(t *testing.T) {
|
|
||||||
t.Parallel()
|
|
||||||
_, baseURL, st := newTestServerWithUI(t)
|
|
||||||
cookie := loginAsAdmin(t, st)
|
|
||||||
hostID := makeHost(t, st, "h-trend2")
|
|
||||||
|
|
||||||
body := getTrend(t, baseURL, hostID, "banana", cookie)
|
|
||||||
if !strings.Contains(body, `data-range="30d"`) {
|
|
||||||
t.Errorf("expected data-range=30d on invalid range fallback")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// TestUIRepoPageRendersTrendPanel — full-page render path: seed 3
|
|
||||||
// history rows, fetch /hosts/{id}/repo, assert the Trend panel with
|
|
||||||
// SVG chart ID, class, and heading text appear embedded in the page.
|
|
||||||
func TestUIRepoPageRendersTrendPanel(t *testing.T) {
|
|
||||||
t.Parallel()
|
|
||||||
_, baseURL, st := newTestServerWithUI(t)
|
|
||||||
cookie := loginAsAdmin(t, st)
|
|
||||||
hostID := makeHost(t, st, "h-trend-page")
|
|
||||||
ctx := context.Background()
|
|
||||||
|
|
||||||
now := time.Now().UTC()
|
|
||||||
for i := 0; i < 3; i++ {
|
|
||||||
day := now.AddDate(0, 0, -i).Format("2006-01-02")
|
|
||||||
v := int64(2000 + i*200)
|
|
||||||
c := int64(20 + i)
|
|
||||||
if err := st.UpsertHostRepoStatsHistory(ctx, hostID, day,
|
|
||||||
store.HostRepoStats{TotalSizeBytes: &v, SnapshotCount: &c}, now); err != nil {
|
|
||||||
t.Fatalf("seed %s: %v", day, err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
body := getRepoPage(t, baseURL, hostID, cookie)
|
|
||||||
|
|
||||||
if !strings.Contains(body, `id="repo-trend-chart"`) {
|
|
||||||
t.Errorf("expected id=\"repo-trend-chart\" in full-page render")
|
|
||||||
}
|
|
||||||
if !strings.Contains(body, `class="repo-trend-chart"`) {
|
|
||||||
t.Errorf("expected class=\"repo-trend-chart\" in full-page render")
|
|
||||||
}
|
|
||||||
if !strings.Contains(body, ">Trend<") {
|
|
||||||
t.Errorf("expected panel heading '>Trend<' in full-page render")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -1,20 +0,0 @@
|
|||||||
package http
|
|
||||||
|
|
||||||
import (
|
|
||||||
"encoding/json"
|
|
||||||
stdhttp "net/http"
|
|
||||||
|
|
||||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/version"
|
|
||||||
)
|
|
||||||
|
|
||||||
// handleVersion exposes the server's build-time identifying constants
|
|
||||||
// (set via -ldflags). Public-band — no secrets surface here, the agent
|
|
||||||
// updater compares its own agent_version byte-for-byte against the
|
|
||||||
// Version field to drive the "out of date" signal.
|
|
||||||
func (s *Server) handleVersion(w stdhttp.ResponseWriter, r *stdhttp.Request) {
|
|
||||||
w.Header().Set("Content-Type", "application/json")
|
|
||||||
_ = json.NewEncoder(w).Encode(map[string]string{
|
|
||||||
"version": version.Version,
|
|
||||||
"commit": version.Commit,
|
|
||||||
})
|
|
||||||
}
|
|
||||||
@@ -1,42 +0,0 @@
|
|||||||
package http
|
|
||||||
|
|
||||||
import (
|
|
||||||
"encoding/json"
|
|
||||||
stdhttp "net/http"
|
|
||||||
"testing"
|
|
||||||
|
|
||||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/version"
|
|
||||||
)
|
|
||||||
|
|
||||||
func TestVersionEndpoint(t *testing.T) {
|
|
||||||
t.Parallel()
|
|
||||||
|
|
||||||
prevV, prevC := version.Version, version.Commit
|
|
||||||
version.Version = "v9.9.9-test"
|
|
||||||
version.Commit = "abc1234"
|
|
||||||
t.Cleanup(func() {
|
|
||||||
version.Version = prevV
|
|
||||||
version.Commit = prevC
|
|
||||||
})
|
|
||||||
|
|
||||||
_, url, _ := newTestServerWithHub(t)
|
|
||||||
|
|
||||||
res, err := stdhttp.Get(url + "/api/version")
|
|
||||||
if err != nil {
|
|
||||||
t.Fatalf("get: %v", err)
|
|
||||||
}
|
|
||||||
defer res.Body.Close()
|
|
||||||
if res.StatusCode != stdhttp.StatusOK {
|
|
||||||
t.Fatalf("status: got %d want 200", res.StatusCode)
|
|
||||||
}
|
|
||||||
var body map[string]string
|
|
||||||
if err := json.NewDecoder(res.Body).Decode(&body); err != nil {
|
|
||||||
t.Fatalf("decode: %v", err)
|
|
||||||
}
|
|
||||||
if body["version"] != "v9.9.9-test" {
|
|
||||||
t.Fatalf("version: got %q", body["version"])
|
|
||||||
}
|
|
||||||
if body["commit"] != "abc1234" {
|
|
||||||
t.Fatalf("commit: got %q", body["commit"])
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -75,28 +75,6 @@ func funcMap() template.FuncMap {
|
|||||||
return *p
|
return *p
|
||||||
},
|
},
|
||||||
"sub": func(a, b int) int { return a - b },
|
"sub": func(a, b int) int { return a - b },
|
||||||
// durationHuman formats the elapsed time between two *time.Time
|
|
||||||
// values as a short human string: "350ms", "4.2s", "2m 15s",
|
|
||||||
// "1h 4m". Returns "—" when either pointer is nil.
|
|
||||||
"durationHuman": func(start, end *time.Time) string {
|
|
||||||
if start == nil || end == nil {
|
|
||||||
return "—"
|
|
||||||
}
|
|
||||||
d := end.Sub(*start)
|
|
||||||
if d < 0 {
|
|
||||||
d = -d
|
|
||||||
}
|
|
||||||
if d < time.Second {
|
|
||||||
return fmt.Sprintf("%dms", d.Milliseconds())
|
|
||||||
}
|
|
||||||
if d < time.Minute {
|
|
||||||
return fmt.Sprintf("%.1fs", d.Seconds())
|
|
||||||
}
|
|
||||||
if d < time.Hour {
|
|
||||||
return fmt.Sprintf("%dm %ds", int(d.Minutes()), int(d.Seconds())%60)
|
|
||||||
}
|
|
||||||
return fmt.Sprintf("%dh %dm", int(d.Hours()), int(d.Minutes())%60)
|
|
||||||
},
|
|
||||||
// joinComma joins a slice with ", ". Used by the schedule list
|
// joinComma joins a slice with ", ". Used by the schedule list
|
||||||
// to render retention summaries.
|
// to render retention summaries.
|
||||||
"joinComma": func(parts []string) string { return strings.Join(parts, ", ") },
|
"joinComma": func(parts []string) string { return strings.Join(parts, ", ") },
|
||||||
|
|||||||
@@ -108,9 +108,6 @@ func New() (*Renderer, error) {
|
|||||||
"templates/partials/tree_node.html",
|
"templates/partials/tree_node.html",
|
||||||
"templates/partials/alert_row.html",
|
"templates/partials/alert_row.html",
|
||||||
"templates/partials/crit_banner.html",
|
"templates/partials/crit_banner.html",
|
||||||
"templates/partials/fleet_update_inner.html",
|
|
||||||
"templates/partials/host_update_chip.html",
|
|
||||||
"templates/partials/repo_size_chart.html",
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pageEntries, err := fs.Glob(web.FS, "templates/pages/*.html")
|
pageEntries, err := fs.Glob(web.FS, "templates/pages/*.html")
|
||||||
|
|||||||
@@ -16,7 +16,6 @@ import (
|
|||||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/api"
|
"gitea.dcglab.co.uk/steve/restic-manager/internal/api"
|
||||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/auth"
|
"gitea.dcglab.co.uk/steve/restic-manager/internal/auth"
|
||||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
|
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
|
||||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/version"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
// HandlerDeps is the set of collaborators the agent WS handler needs.
|
// HandlerDeps is the set of collaborators the agent WS handler needs.
|
||||||
@@ -27,9 +26,6 @@ type HandlerDeps struct {
|
|||||||
// AlertEngine receives job-finished and host-online events so the
|
// AlertEngine receives job-finished and host-online events so the
|
||||||
// alert engine can evaluate its rules. Optional; nil = no-op.
|
// alert engine can evaluate its rules. Optional; nil = no-op.
|
||||||
AlertEngine *alert.Engine
|
AlertEngine *alert.Engine
|
||||||
// UpdateWatcher reconciles in-flight agent-update dispatches against
|
|
||||||
// hello envelopes. Optional; nil = no-op.
|
|
||||||
UpdateWatcher *UpdateWatcher
|
|
||||||
// OnHello is called once per successful hello, after the host row
|
// OnHello is called once per successful hello, after the host row
|
||||||
// has been touched and the conn registered. Used by the HTTP
|
// has been touched and the conn registered. Used by the HTTP
|
||||||
// layer to push host_credentials down as a config.update before
|
// layer to push host_credentials down as a config.update before
|
||||||
@@ -151,9 +147,6 @@ func runAgentLoop(ctx context.Context, c *Conn, hostID string, deps HandlerDeps)
|
|||||||
if deps.AlertEngine != nil {
|
if deps.AlertEngine != nil {
|
||||||
deps.AlertEngine.NotifyHostOnline(hostID)
|
deps.AlertEngine.NotifyHostOnline(hostID)
|
||||||
}
|
}
|
||||||
if deps.UpdateWatcher != nil {
|
|
||||||
deps.UpdateWatcher.OnHello(ctx, hostID, helloPayload.AgentVersion, version.Version)
|
|
||||||
}
|
|
||||||
|
|
||||||
deps.Hub.Register(hostID, c)
|
deps.Hub.Register(hostID, c)
|
||||||
defer deps.Hub.Unregister(hostID, c)
|
defer deps.Hub.Unregister(hostID, c)
|
||||||
@@ -227,17 +220,11 @@ func dispatchAgentMessage(ctx context.Context, c *Conn, hostID string, env api.E
|
|||||||
// a *success* — restic's idempotent init returns that when the
|
// a *success* — restic's idempotent init returns that when the
|
||||||
// repo is already initialised, which is the happy path for
|
// repo is already initialised, which is the happy path for
|
||||||
// onboarding against an existing repo.
|
// onboarding against an existing repo.
|
||||||
if job, err := deps.Store.GetJob(ctx, p.JobID); err == nil && job != nil {
|
if job, err := deps.Store.GetJob(ctx, p.JobID); err == nil && job != nil &&
|
||||||
switch job.Kind {
|
job.Kind == string(api.JobInit) {
|
||||||
case string(api.JobInit):
|
status, errOut := repoStatusFromInit(string(p.Status), errMsg)
|
||||||
status, errOut := repoStatusFromInit(string(p.Status), errMsg)
|
if err := deps.Store.SetHostRepoStatus(ctx, hostID, status, errOut); err != nil {
|
||||||
if err := deps.Store.SetHostRepoStatus(ctx, hostID, status, errOut); err != nil {
|
slog.Warn("ws: set host repo status", "host_id", hostID, "err", err)
|
||||||
slog.Warn("ws: set host repo status", "host_id", hostID, "err", err)
|
|
||||||
}
|
|
||||||
case string(api.JobBackup):
|
|
||||||
if err := deps.Store.SetHostLastBackup(ctx, hostID, string(p.Status), p.FinishedAt); err != nil {
|
|
||||||
slog.Warn("ws: set host last backup", "host_id", hostID, "err", err)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if deps.JobHub != nil {
|
if deps.JobHub != nil {
|
||||||
@@ -339,10 +326,6 @@ func dispatchAgentMessage(ctx context.Context, c *Conn, hostID string, env api.E
|
|||||||
} else {
|
} else {
|
||||||
slog.Info("ws: repo stats refreshed", "host_id", hostID)
|
slog.Info("ws: repo stats refreshed", "host_id", hostID)
|
||||||
}
|
}
|
||||||
day := time.Now().UTC().Format("2006-01-02")
|
|
||||||
if err := deps.Store.UpsertHostRepoStatsHistory(ctx, hostID, day, patch, time.Now().UTC()); err != nil {
|
|
||||||
slog.Warn("ws: upsert host repo stats history", "host_id", hostID, "err", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
case api.MsgCommandResult:
|
case api.MsgCommandResult:
|
||||||
// TODO(P2): persist command.result acks for "did the agent
|
// TODO(P2): persist command.result acks for "did the agent
|
||||||
|
|||||||
@@ -133,42 +133,3 @@ func TestRepoStatsReportPartialUpdate(t *testing.T) {
|
|||||||
t.Errorf("LastCheckStatus: got %q want ok", got.LastCheckStatus)
|
t.Errorf("LastCheckStatus: got %q want ok", got.LastCheckStatus)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestRepoStatsReportWritesHistoryRow(t *testing.T) {
|
|
||||||
t.Parallel()
|
|
||||||
s := openWSTestStore(t)
|
|
||||||
ctx := context.Background()
|
|
||||||
|
|
||||||
const hostID = "h-stats-history"
|
|
||||||
seedHostWS(t, s, hostID)
|
|
||||||
|
|
||||||
payload := api.RepoStatsPayload{
|
|
||||||
TotalSizeBytes: int64ptrWS(12345),
|
|
||||||
SnapshotCount: int64ptrWS(7),
|
|
||||||
}
|
|
||||||
env, err := api.Marshal(api.MsgRepoStats, "", payload)
|
|
||||||
if err != nil {
|
|
||||||
t.Fatalf("marshal: %v", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
deps := HandlerDeps{Store: s}
|
|
||||||
dispatchAgentMessage(ctx, nil, hostID, env, deps)
|
|
||||||
|
|
||||||
pts, err := s.ListHostRepoStatsHistory(ctx, hostID, time.Time{})
|
|
||||||
if err != nil {
|
|
||||||
t.Fatalf("list history: %v", err)
|
|
||||||
}
|
|
||||||
if len(pts) != 1 {
|
|
||||||
t.Fatalf("want 1 history row, got %d", len(pts))
|
|
||||||
}
|
|
||||||
wantDay := time.Now().UTC().Format("2006-01-02")
|
|
||||||
if got := pts[0].Day.Format("2006-01-02"); got != wantDay {
|
|
||||||
t.Errorf("day: want %s, got %s", wantDay, got)
|
|
||||||
}
|
|
||||||
if pts[0].TotalSizeBytes == nil || *pts[0].TotalSizeBytes != 12345 {
|
|
||||||
t.Errorf("TotalSizeBytes: want 12345, got %v", pts[0].TotalSizeBytes)
|
|
||||||
}
|
|
||||||
if pts[0].SnapshotCount == nil || *pts[0].SnapshotCount != 7 {
|
|
||||||
t.Errorf("SnapshotCount: want 7, got %v", pts[0].SnapshotCount)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|||||||
@@ -1,184 +0,0 @@
|
|||||||
package ws
|
|
||||||
|
|
||||||
import (
|
|
||||||
"context"
|
|
||||||
"fmt"
|
|
||||||
"log/slog"
|
|
||||||
"sync"
|
|
||||||
"time"
|
|
||||||
|
|
||||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/api"
|
|
||||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
|
|
||||||
)
|
|
||||||
|
|
||||||
// updateTimeout is the maximum time the watcher waits, after a
// command.update dispatch, for the agent to come back reporting its new
// version. It is a variable rather than a constant so tests can shrink
// it.
var updateTimeout time.Duration = 90 * time.Second
|
|
||||||
|
|
||||||
// AlertRaiser is the narrow slice of *alert.Engine that the update
// watcher depends on. It is declared here, not in the alert package, so
// the dependency arrow points the right way.
type AlertRaiser interface {
	// RaiseUpdateFailed is invoked by the watcher when a tracked update
	// job is failed on timeout.
	RaiseUpdateFailed(ctx context.Context, hostID string, jobID string, reason string, when time.Time)
	// ResolveUpdateFailed is invoked by the watcher when a tracked
	// update job succeeds.
	ResolveUpdateFailed(ctx context.Context, hostID string, when time.Time)
}
|
|
||||||
|
|
||||||
// UpdateWatcher tracks in-flight agent-update dispatches and reconciles
|
|
||||||
// them against incoming hello envelopes. Entries land on Track and
|
|
||||||
// resolve via OnHello (success path) or the periodic sweep (timeout).
|
|
||||||
type UpdateWatcher struct {
|
|
||||||
store *store.Store
|
|
||||||
alerts AlertRaiser
|
|
||||||
jobHub *JobHub // optional — if nil, no fan-out to browser streams
|
|
||||||
|
|
||||||
mu sync.Mutex
|
|
||||||
entries map[string]*updateEntry // hostID → entry
|
|
||||||
|
|
||||||
tickPeriod time.Duration
|
|
||||||
}
|
|
||||||
|
|
||||||
// updateEntry is the per-host record of one tracked update dispatch.
type updateEntry struct {
	// jobID identifies the update job this entry stands for.
	jobID string
	// startedAt is when the entry was registered; the sweep measures
	// the timeout from it.
	startedAt time.Time
	// terminated flips to true once the entry reaches a terminal state,
	// so that a late OnHello cannot resurrect it.
	terminated bool
}
|
|
||||||
|
|
||||||
// NewUpdateWatcher builds an unstarted watcher. Call Run in a goroutine
|
|
||||||
// to start the periodic sweep.
|
|
||||||
func NewUpdateWatcher(st *store.Store, alerts AlertRaiser, jobHub *JobHub) *UpdateWatcher {
|
|
||||||
return &UpdateWatcher{
|
|
||||||
store: st,
|
|
||||||
alerts: alerts,
|
|
||||||
jobHub: jobHub,
|
|
||||||
entries: make(map[string]*updateEntry),
|
|
||||||
tickPeriod: 5 * time.Second,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Track registers a freshly-dispatched update job. A subsequent Track
|
|
||||||
// for the same host replaces the prior entry (last-write-wins).
|
|
||||||
func (w *UpdateWatcher) Track(jobID, hostID string) {
|
|
||||||
if w == nil {
|
|
||||||
return
|
|
||||||
}
|
|
||||||
w.mu.Lock()
|
|
||||||
w.entries[hostID] = &updateEntry{jobID: jobID, startedAt: time.Now()}
|
|
||||||
w.mu.Unlock()
|
|
||||||
}
|
|
||||||
|
|
||||||
// OnHello is called by the WS handler after a successful hello has been
|
|
||||||
// persisted. If a tracked update for the host matches the targetVersion,
|
|
||||||
// the job is marked succeeded and any open update_failed alert is
|
|
||||||
// auto-resolved. A non-matching version is a no-op (the watcher keeps
|
|
||||||
// waiting until the timeout).
|
|
||||||
func (w *UpdateWatcher) OnHello(ctx context.Context, hostID, agentVersion, targetVersion string) {
|
|
||||||
if w == nil {
|
|
||||||
return
|
|
||||||
}
|
|
||||||
w.mu.Lock()
|
|
||||||
e, ok := w.entries[hostID]
|
|
||||||
if !ok || e.terminated {
|
|
||||||
w.mu.Unlock()
|
|
||||||
return
|
|
||||||
}
|
|
||||||
if agentVersion != targetVersion {
|
|
||||||
// Not the version we asked for — keep waiting.
|
|
||||||
w.mu.Unlock()
|
|
||||||
return
|
|
||||||
}
|
|
||||||
e.terminated = true
|
|
||||||
jobID := e.jobID
|
|
||||||
delete(w.entries, hostID)
|
|
||||||
w.mu.Unlock()
|
|
||||||
|
|
||||||
now := time.Now().UTC()
|
|
||||||
if err := w.store.MarkJobFinished(ctx, jobID, "succeeded", 0, nil, "", now); err != nil {
|
|
||||||
slog.Warn("ws update watcher: mark succeeded", "job_id", jobID, "host_id", hostID, "err", err)
|
|
||||||
}
|
|
||||||
w.publishJobFinished(jobID, api.JobSucceeded, 0, "", now)
|
|
||||||
if w.alerts != nil {
|
|
||||||
w.alerts.ResolveUpdateFailed(ctx, hostID, now)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Run drives the periodic sweep. Returns when ctx is done.
|
|
||||||
func (w *UpdateWatcher) Run(ctx context.Context) {
|
|
||||||
if w == nil {
|
|
||||||
return
|
|
||||||
}
|
|
||||||
t := time.NewTicker(w.tickPeriod)
|
|
||||||
defer t.Stop()
|
|
||||||
for {
|
|
||||||
select {
|
|
||||||
case <-ctx.Done():
|
|
||||||
return
|
|
||||||
case now := <-t.C:
|
|
||||||
w.sweep(ctx, now)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func (w *UpdateWatcher) sweep(ctx context.Context, now time.Time) {
|
|
||||||
type expired struct {
|
|
||||||
hostID string
|
|
||||||
jobID string
|
|
||||||
age time.Duration
|
|
||||||
}
|
|
||||||
var toFail []expired
|
|
||||||
w.mu.Lock()
|
|
||||||
for hostID, e := range w.entries {
|
|
||||||
if e.terminated {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
if now.Sub(e.startedAt) >= updateTimeout {
|
|
||||||
toFail = append(toFail, expired{hostID: hostID, jobID: e.jobID, age: now.Sub(e.startedAt)})
|
|
||||||
e.terminated = true
|
|
||||||
delete(w.entries, hostID)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
w.mu.Unlock()
|
|
||||||
|
|
||||||
for _, x := range toFail {
|
|
||||||
reason := fmt.Sprintf("timeout: agent did not reconnect within %s", updateTimeout)
|
|
||||||
stamp := now.UTC()
|
|
||||||
errMsg := reason
|
|
||||||
if err := w.store.MarkJobFinished(ctx, x.jobID, "failed", -1, nil, errMsg, stamp); err != nil {
|
|
||||||
slog.Warn("ws update watcher: mark failed", "job_id", x.jobID, "host_id", x.hostID, "err", err)
|
|
||||||
}
|
|
||||||
w.publishJobFinished(x.jobID, api.JobFailed, -1, errMsg, stamp)
|
|
||||||
if w.alerts != nil {
|
|
||||||
w.alerts.RaiseUpdateFailed(ctx, x.hostID, x.jobID, reason, stamp)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// publishJobFinished pushes a synthetic job.finished envelope into the
|
|
||||||
// JobHub so any browser still streaming this job sees it terminate.
|
|
||||||
// The agent itself exits before it can send job.finished (it has to —
|
|
||||||
// it's about to relaunch into the new binary), so without this fan-out
|
|
||||||
// the /jobs/{id} page hangs until reload.
|
|
||||||
//
|
|
||||||
// Best-effort: if the hub is nil or the envelope can't be marshalled
|
|
||||||
// we log and move on — the DB-side state is already correct, this is
|
|
||||||
// purely a UI wake-up.
|
|
||||||
func (w *UpdateWatcher) publishJobFinished(jobID string, status api.JobStatus, exitCode int, errMsg string, finishedAt time.Time) {
|
|
||||||
if w.jobHub == nil {
|
|
||||||
return
|
|
||||||
}
|
|
||||||
payload := api.JobFinishedPayload{
|
|
||||||
JobID: jobID,
|
|
||||||
Status: status,
|
|
||||||
ExitCode: exitCode,
|
|
||||||
FinishedAt: finishedAt,
|
|
||||||
Error: errMsg,
|
|
||||||
}
|
|
||||||
env, err := api.Marshal(api.MsgJobFinished, "", payload)
|
|
||||||
if err != nil {
|
|
||||||
slog.Warn("ws update watcher: marshal synthetic job.finished", "job_id", jobID, "err", err)
|
|
||||||
return
|
|
||||||
}
|
|
||||||
w.jobHub.Broadcast(jobID, env)
|
|
||||||
}
|
|
||||||
@@ -1,230 +0,0 @@
|
|||||||
package ws
|
|
||||||
|
|
||||||
import (
|
|
||||||
"context"
|
|
||||||
"sync"
|
|
||||||
"testing"
|
|
||||||
"time"
|
|
||||||
|
|
||||||
"github.com/oklog/ulid/v2"
|
|
||||||
|
|
||||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/api"
|
|
||||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
|
|
||||||
)
|
|
||||||
|
|
||||||
// fakeAlerts is a recording test double with the same two methods as
// AlertRaiser. Every call appends to the matching slice under mu so
// tests can inspect what the watcher reported.
type fakeAlerts struct {
	mu       sync.Mutex
	raised   []string // hostIDs passed to RaiseUpdateFailed
	resolved []string // hostIDs passed to ResolveUpdateFailed
	reasons  []string // reasons passed to RaiseUpdateFailed, same order as raised
}

// RaiseUpdateFailed records the host ID and the reason; the job ID and
// timestamp are ignored.
func (f *fakeAlerts) RaiseUpdateFailed(_ context.Context, hostID, _ /*jobID*/, reason string, _ time.Time) {
	f.mu.Lock()
	f.raised, f.reasons = append(f.raised, hostID), append(f.reasons, reason)
	f.mu.Unlock()
}

// ResolveUpdateFailed records the host ID; the timestamp is ignored.
func (f *fakeAlerts) ResolveUpdateFailed(_ context.Context, hostID string, _ time.Time) {
	f.mu.Lock()
	f.resolved = append(f.resolved, hostID)
	f.mu.Unlock()
}
|
|
||||||
|
|
||||||
func seedJob(t *testing.T, st *store.Store, hostID string) string {
|
|
||||||
t.Helper()
|
|
||||||
jobID := ulid.Make().String()
|
|
||||||
if err := st.CreateJob(context.Background(), store.Job{
|
|
||||||
ID: jobID, HostID: hostID, Kind: "update",
|
|
||||||
ActorKind: "user", CreatedAt: time.Now().UTC(),
|
|
||||||
}); err != nil {
|
|
||||||
t.Fatalf("create job: %v", err)
|
|
||||||
}
|
|
||||||
return jobID
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestUpdateWatcherOnHelloSuccess(t *testing.T) {
|
|
||||||
st := openWSTestStore(t)
|
|
||||||
hostID := ulid.Make().String()
|
|
||||||
seedHostWS(t, st, hostID)
|
|
||||||
jobID := seedJob(t, st, hostID)
|
|
||||||
|
|
||||||
a := &fakeAlerts{}
|
|
||||||
w := NewUpdateWatcher(st, a, nil)
|
|
||||||
w.Track(jobID, hostID)
|
|
||||||
|
|
||||||
w.OnHello(context.Background(), hostID, "v2", "v2")
|
|
||||||
|
|
||||||
job, err := st.GetJob(context.Background(), jobID)
|
|
||||||
if err != nil {
|
|
||||||
t.Fatalf("get job: %v", err)
|
|
||||||
}
|
|
||||||
if job.Status != "succeeded" {
|
|
||||||
t.Fatalf("status: got %q want succeeded", job.Status)
|
|
||||||
}
|
|
||||||
a.mu.Lock()
|
|
||||||
defer a.mu.Unlock()
|
|
||||||
if len(a.resolved) != 1 || a.resolved[0] != hostID {
|
|
||||||
t.Fatalf("resolve calls: %v", a.resolved)
|
|
||||||
}
|
|
||||||
if len(a.raised) != 0 {
|
|
||||||
t.Fatalf("unexpected raises: %v", a.raised)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestUpdateWatcherTimeout(t *testing.T) {
|
|
||||||
prev := updateTimeout
|
|
||||||
updateTimeout = 50 * time.Millisecond
|
|
||||||
t.Cleanup(func() { updateTimeout = prev })
|
|
||||||
|
|
||||||
st := openWSTestStore(t)
|
|
||||||
hostID := ulid.Make().String()
|
|
||||||
seedHostWS(t, st, hostID)
|
|
||||||
jobID := seedJob(t, st, hostID)
|
|
||||||
|
|
||||||
a := &fakeAlerts{}
|
|
||||||
w := NewUpdateWatcher(st, a, nil)
|
|
||||||
w.Track(jobID, hostID)
|
|
||||||
|
|
||||||
time.Sleep(80 * time.Millisecond)
|
|
||||||
w.sweep(context.Background(), time.Now())
|
|
||||||
|
|
||||||
job, err := st.GetJob(context.Background(), jobID)
|
|
||||||
if err != nil {
|
|
||||||
t.Fatalf("get job: %v", err)
|
|
||||||
}
|
|
||||||
if job.Status != "failed" {
|
|
||||||
t.Fatalf("status: got %q want failed", job.Status)
|
|
||||||
}
|
|
||||||
a.mu.Lock()
|
|
||||||
defer a.mu.Unlock()
|
|
||||||
if len(a.raised) != 1 || a.raised[0] != hostID {
|
|
||||||
t.Fatalf("raise calls: %v", a.raised)
|
|
||||||
}
|
|
||||||
if len(a.reasons) == 0 || a.reasons[0] == "" {
|
|
||||||
t.Fatalf("missing reason")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestUpdateWatcherMismatchedVersionNoOp(t *testing.T) {
|
|
||||||
st := openWSTestStore(t)
|
|
||||||
hostID := ulid.Make().String()
|
|
||||||
seedHostWS(t, st, hostID)
|
|
||||||
jobID := seedJob(t, st, hostID)
|
|
||||||
|
|
||||||
a := &fakeAlerts{}
|
|
||||||
w := NewUpdateWatcher(st, a, nil)
|
|
||||||
w.Track(jobID, hostID)
|
|
||||||
|
|
||||||
w.OnHello(context.Background(), hostID, "v1", "v2")
|
|
||||||
|
|
||||||
job, _ := st.GetJob(context.Background(), jobID)
|
|
||||||
if job.Status == "succeeded" || job.Status == "failed" {
|
|
||||||
t.Fatalf("status flipped on mismatched hello: %q", job.Status)
|
|
||||||
}
|
|
||||||
a.mu.Lock()
|
|
||||||
defer a.mu.Unlock()
|
|
||||||
if len(a.raised) != 0 || len(a.resolved) != 0 {
|
|
||||||
t.Fatalf("unexpected alert calls raised=%v resolved=%v", a.raised, a.resolved)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestUpdateWatcherHelloAfterTimeoutIsNoOp(t *testing.T) {
|
|
||||||
prev := updateTimeout
|
|
||||||
updateTimeout = 50 * time.Millisecond
|
|
||||||
t.Cleanup(func() { updateTimeout = prev })
|
|
||||||
|
|
||||||
st := openWSTestStore(t)
|
|
||||||
hostID := ulid.Make().String()
|
|
||||||
seedHostWS(t, st, hostID)
|
|
||||||
jobID := seedJob(t, st, hostID)
|
|
||||||
|
|
||||||
a := &fakeAlerts{}
|
|
||||||
w := NewUpdateWatcher(st, a, nil)
|
|
||||||
w.Track(jobID, hostID)
|
|
||||||
|
|
||||||
time.Sleep(80 * time.Millisecond)
|
|
||||||
w.sweep(context.Background(), time.Now())
|
|
||||||
|
|
||||||
// Hello arrives after sweep — entry already gone, must be no-op.
|
|
||||||
w.OnHello(context.Background(), hostID, "v2", "v2")
|
|
||||||
|
|
||||||
job, _ := st.GetJob(context.Background(), jobID)
|
|
||||||
if job.Status != "failed" {
|
|
||||||
t.Fatalf("status flipped from failed → %q", job.Status)
|
|
||||||
}
|
|
||||||
a.mu.Lock()
|
|
||||||
defer a.mu.Unlock()
|
|
||||||
if len(a.resolved) != 0 {
|
|
||||||
t.Fatalf("late hello triggered ResolveUpdateFailed: %v", a.resolved)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestUpdateWatcherOnHelloBroadcastsJobFinished(t *testing.T) {
|
|
||||||
st := openWSTestStore(t)
|
|
||||||
hostID := ulid.Make().String()
|
|
||||||
seedHostWS(t, st, hostID)
|
|
||||||
jobID := seedJob(t, st, hostID)
|
|
||||||
|
|
||||||
hub := NewJobHub()
|
|
||||||
sub := hub.Register(jobID)
|
|
||||||
defer sub.unregister()
|
|
||||||
|
|
||||||
w := NewUpdateWatcher(st, &fakeAlerts{}, hub)
|
|
||||||
w.Track(jobID, hostID)
|
|
||||||
w.OnHello(context.Background(), hostID, "v2", "v2")
|
|
||||||
|
|
||||||
select {
|
|
||||||
case env := <-sub.ch:
|
|
||||||
if env.Type != api.MsgJobFinished {
|
|
||||||
t.Fatalf("envelope type: got %q want %q", env.Type, api.MsgJobFinished)
|
|
||||||
}
|
|
||||||
var p api.JobFinishedPayload
|
|
||||||
if err := env.UnmarshalPayload(&p); err != nil {
|
|
||||||
t.Fatalf("unmarshal payload: %v", err)
|
|
||||||
}
|
|
||||||
if p.JobID != jobID || p.Status != api.JobSucceeded {
|
|
||||||
t.Fatalf("payload: got %+v", p)
|
|
||||||
}
|
|
||||||
case <-time.After(time.Second):
|
|
||||||
t.Fatal("expected synthetic job.finished broadcast, got nothing")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestUpdateWatcherTimeoutBroadcastsJobFinished(t *testing.T) {
|
|
||||||
prev := updateTimeout
|
|
||||||
updateTimeout = 50 * time.Millisecond
|
|
||||||
t.Cleanup(func() { updateTimeout = prev })
|
|
||||||
|
|
||||||
st := openWSTestStore(t)
|
|
||||||
hostID := ulid.Make().String()
|
|
||||||
seedHostWS(t, st, hostID)
|
|
||||||
jobID := seedJob(t, st, hostID)
|
|
||||||
|
|
||||||
hub := NewJobHub()
|
|
||||||
sub := hub.Register(jobID)
|
|
||||||
defer sub.unregister()
|
|
||||||
|
|
||||||
w := NewUpdateWatcher(st, &fakeAlerts{}, hub)
|
|
||||||
w.Track(jobID, hostID)
|
|
||||||
|
|
||||||
time.Sleep(80 * time.Millisecond)
|
|
||||||
w.sweep(context.Background(), time.Now())
|
|
||||||
|
|
||||||
select {
|
|
||||||
case env := <-sub.ch:
|
|
||||||
if env.Type != api.MsgJobFinished {
|
|
||||||
t.Fatalf("envelope type: got %q want %q", env.Type, api.MsgJobFinished)
|
|
||||||
}
|
|
||||||
var p api.JobFinishedPayload
|
|
||||||
if err := env.UnmarshalPayload(&p); err != nil {
|
|
||||||
t.Fatalf("unmarshal payload: %v", err)
|
|
||||||
}
|
|
||||||
if p.JobID != jobID || p.Status != api.JobFailed {
|
|
||||||
t.Fatalf("payload: got %+v", p)
|
|
||||||
}
|
|
||||||
case <-time.After(time.Second):
|
|
||||||
t.Fatal("expected synthetic job.finished broadcast, got nothing")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -77,56 +77,6 @@ func (s *Store) RaiseOrTouch(ctx context.Context, hostID, kind, dedupKey, severi
|
|||||||
return id, true, nil
|
return id, true, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// RaiseOrTouchSystem is the host-less variant of RaiseOrTouch — the
|
|
||||||
// alert row's host_id is stored as NULL, so the FK to hosts is bypassed.
|
|
||||||
// Used by fleet-wide alerts (e.g. fleet_update_halted) where the
|
|
||||||
// failure surface isn't pinned to a single host.
|
|
||||||
func (s *Store) RaiseOrTouchSystem(ctx context.Context, kind, dedupKey, severity, message string, when time.Time) (id string, didRaise bool, err error) {
|
|
||||||
tx, err := s.db.BeginTx(ctx, nil)
|
|
||||||
if err != nil {
|
|
||||||
return "", false, fmt.Errorf("store: begin: %w", err)
|
|
||||||
}
|
|
||||||
defer func() { _ = tx.Rollback() }()
|
|
||||||
|
|
||||||
row := tx.QueryRowContext(ctx,
|
|
||||||
`SELECT id FROM alerts
|
|
||||||
WHERE host_id IS NULL AND kind = ? AND dedup_key = ? AND resolved_at IS NULL
|
|
||||||
LIMIT 1`,
|
|
||||||
kind, dedupKey)
|
|
||||||
var existing string
|
|
||||||
switch err := row.Scan(&existing); {
|
|
||||||
case err == nil:
|
|
||||||
_, uerr := tx.ExecContext(ctx,
|
|
||||||
`UPDATE alerts SET last_seen_at = ?, message = ? WHERE id = ?`,
|
|
||||||
when.UTC().Format(time.RFC3339Nano), message, existing)
|
|
||||||
if uerr != nil {
|
|
||||||
return "", false, fmt.Errorf("store: touch alert: %w", uerr)
|
|
||||||
}
|
|
||||||
if err := tx.Commit(); err != nil {
|
|
||||||
return "", false, err
|
|
||||||
}
|
|
||||||
return existing, false, nil
|
|
||||||
case errors.Is(err, sql.ErrNoRows):
|
|
||||||
// fall through to insert
|
|
||||||
default:
|
|
||||||
return "", false, fmt.Errorf("store: lookup alert: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
id = ulid.Make().String()
|
|
||||||
whenStr := when.UTC().Format(time.RFC3339Nano)
|
|
||||||
_, err = tx.ExecContext(ctx,
|
|
||||||
`INSERT INTO alerts (id, host_id, kind, dedup_key, severity, message, created_at, last_seen_at)
|
|
||||||
VALUES (?, NULL, ?, ?, ?, ?, ?, ?)`,
|
|
||||||
id, kind, dedupKey, severity, message, whenStr, whenStr)
|
|
||||||
if err != nil {
|
|
||||||
return "", false, fmt.Errorf("store: insert alert: %w", err)
|
|
||||||
}
|
|
||||||
if err := tx.Commit(); err != nil {
|
|
||||||
return "", false, err
|
|
||||||
}
|
|
||||||
return id, true, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// refreshHostOpenAlertCount recomputes hosts.open_alert_count from the
|
// refreshHostOpenAlertCount recomputes hosts.open_alert_count from the
|
||||||
// alerts table for one host. Self-healing: idempotent and survives
|
// alerts table for one host. Self-healing: idempotent and survives
|
||||||
// out-of-order edits. Best-effort — errors are returned but callers
|
// out-of-order edits. Best-effort — errors are returned but callers
|
||||||
|
|||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user