diff --git a/.gitea/PULL_REQUEST_TEMPLATE.md b/.gitea/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 0000000..19a774b --- /dev/null +++ b/.gitea/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,32 @@ + + +## Summary + + + +## Test plan + + + +## Notes for the reviewer + + + +## Linked issues + + diff --git a/.gitea/issue_template/bug_report.md b/.gitea/issue_template/bug_report.md new file mode 100644 index 0000000..d73ce40 --- /dev/null +++ b/.gitea/issue_template/bug_report.md @@ -0,0 +1,52 @@ +--- +name: Bug report +about: Something isn't behaving the way the docs / code suggest it should +title: "[bug] " +labels: bug +--- + +## What happened + + + +## What you expected + + + +## Steps to reproduce + +1. +2. +3. + +## Environment + +- restic-manager server version: +- Agent version (if relevant): +- restic version on affected host: +- Host OS: +- How was the server installed: + +## Logs / output + +
<details><summary>Server log (sanitised)</summary> + +``` + +``` + +</details>
+ +
<details><summary>Agent log (sanitised)</summary> + +``` +``` + +</details>
+ +## Anything else + + diff --git a/.gitea/issue_template/feature_request.md b/.gitea/issue_template/feature_request.md new file mode 100644 index 0000000..5d0a297 --- /dev/null +++ b/.gitea/issue_template/feature_request.md @@ -0,0 +1,34 @@ +--- +name: Feature request +about: Suggest a new capability or change to existing behaviour +title: "[feature] " +labels: enhancement +--- + +## What you're trying to do + + + +## Why the current behaviour falls short + + + +## Proposed direction (optional) + + + +## Scope check + +- [ ] I've read [`spec.md`](../spec.md) §2 (Goals & Non-Goals). +- [ ] This isn't already on the roadmap in [`tasks.md`](../tasks.md). +- [ ] This fits the project's "small fleet, one person operating" + target rather than enterprise / multi-tenant / SaaS use cases. + +## Anything else + + diff --git a/.gitea/workflows/e2e.yml b/.gitea/workflows/e2e.yml new file mode 100644 index 0000000..39ad37f --- /dev/null +++ b/.gitea/workflows/e2e.yml @@ -0,0 +1,97 @@ +# P5-06 — End-to-end test suite. +# +# Spec : docs/superpowers/specs/2026-05-07-p5-oss-readiness-design.md +# Stack: e2e/compose.e2e.yml (server + agent + rest-server) +# Tests: e2e/playwright/tests/*.spec.ts +# +# Triggered on every PR into main and on workflow_dispatch. Runs +# longer than the unit-test workflow (~3-4 minutes for a clean run); +# kept separate so a slow e2e doesn't block the fast lint/test loop. 
+ +name: e2e + +on: + pull_request: + branches: [main] + workflow_dispatch: + +jobs: + e2e: + name: Playwright vs docker-compose + runs-on: ubuntu-latest + timeout-minutes: 15 + steps: + - uses: actions/checkout@v4 + + - name: Build the e2e stack + run: docker compose -f e2e/compose.e2e.yml build + + - name: Bring up the stack + run: docker compose -f e2e/compose.e2e.yml up -d server rest-server source-fixture + + - name: Wait for server health + run: | + set -eu + for i in $(seq 1 30); do + if curl -fsS http://127.0.0.1:8080/api/version >/dev/null 2>&1; then + echo "server up"; exit 0 + fi + sleep 2 + done + echo "server didn't come up"; docker compose -f e2e/compose.e2e.yml logs server; exit 1 + + - name: Capture bootstrap token from server logs + id: bootstrap + run: | + set -eu + for i in $(seq 1 15); do + line=$(docker compose -f e2e/compose.e2e.yml logs server 2>&1 | grep -E 'bootstrap token' -A2 | grep -Eo '[a-zA-Z0-9_-]{40,}' | head -1 || true) + if [ -n "$line" ]; then + echo "RM_BOOTSTRAP_TOKEN=$line" >> "$GITHUB_ENV" + echo "got bootstrap token (${#line} chars)" + exit 0 + fi + sleep 1 + done + echo "bootstrap token not found in logs" + docker compose -f e2e/compose.e2e.yml logs server + exit 1 + + - name: Start the agent + run: docker compose -f e2e/compose.e2e.yml up -d agent + + - uses: actions/setup-node@v4 + with: + node-version: '20' + + - name: Install Playwright + working-directory: e2e/playwright + run: | + npm install --no-audit --no-fund + npx playwright install --with-deps chromium + + - name: Run Playwright tests + working-directory: e2e/playwright + env: + RM_BASE_URL: http://127.0.0.1:8080 + RM_BOOTSTRAP_TOKEN: ${{ env.RM_BOOTSTRAP_TOKEN }} + run: npx playwright test + + - name: Compose logs (on failure) + if: failure() + run: | + docker compose -f e2e/compose.e2e.yml logs --tail=200 server + docker compose -f e2e/compose.e2e.yml logs --tail=200 agent + docker compose -f e2e/compose.e2e.yml logs --tail=200 rest-server + + - name: Upload 
Playwright report (on failure) + if: failure() + uses: actions/upload-artifact@v3 + with: + name: playwright-report + path: e2e/playwright/playwright-report + retention-days: 7 + + - name: Tear down + if: always() + run: docker compose -f e2e/compose.e2e.yml down -v diff --git a/.gitignore b/.gitignore index 289d6ef..9e71078 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,10 @@ /bin/ /dist/ +# Generated mdBook output (source under docs/book/src is committed, +# the rendered book/ directory is not). +/docs/book/book/ + # Local data / runtime state /data/ /certs/ diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 0000000..14c1e21 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,69 @@ +# Code of Conduct + +restic-manager is a small project run by one person. This Code of +Conduct sets out the basic expectations for participating in the +project's issue tracker, pull requests, and any other community +spaces (chat, mailing lists) we may run in future. + +## Expected behaviour + +- **Be civil.** Disagreement is fine; rudeness is not. The same + comment can usually be made without making it personal. +- **Assume good faith.** People asking what feels like a basic + question may be new to the project. People proposing what feels + like a duplicate idea may not have seen the prior discussion. + Point them to the right place politely. +- **Stay on topic.** Issue threads are for the issue. Tangential + conversations belong in their own thread. +- **Acknowledge the project's scope.** restic-manager is + intentionally small in scope (see `spec.md` §2). Reasonable + feature suggestions may still be declined for fit reasons. + +## Unacceptable behaviour + +- Harassment, threats, or insults — public or private. +- Discriminatory comments based on age, body size, disability, + ethnicity, gender identity or expression, level of experience, + nationality, personal appearance, race, religion, sexual identity + or orientation. 
+- Sustained disruption — derailing threads, ignoring repeated + requests to take a discussion elsewhere, brigading. +- Publishing other people's private information without permission. + +## Reporting + +If someone in the project's spaces is behaving in a way that +breaches this Code of Conduct, contact the maintainer directly +through the contact details on their Gitea profile, or via the +private security disclosure path documented in +[SECURITY.md](./SECURITY.md). Reports stay confidential. + +The maintainer will review the report, gather context if needed, +and respond. Possible outcomes include a private warning, a public +clarification of expectations, a temporary or permanent ban from +project spaces, or no action if the report doesn't hold up. + +There is no formal appeals process — this is a one-person project, +not a foundation. If you think a decision was wrong you can say +so, in writing, to the maintainer; that's it. + +## Scope + +This Code of Conduct applies to interactions in any space the +project owns or operates: the Gitea repository (issues, pull +requests, discussions, wiki), any chat channels we publish, and +any conferences or events the project is officially represented at. + +It does not apply to: + +- Forks of the project that aren't being submitted back upstream. +- Conversations between contributors that don't reference the + project. +- Public criticism of the project itself. + +## Acknowledgement + +This document borrows shape and language from the +[Contributor Covenant](https://www.contributor-covenant.org/) v2.1 +but is intentionally shorter and adapted to the project's +single-maintainer reality. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index ccc9d39..4e7647e 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,30 +1,168 @@ -# Contributing +# Contributing to restic-manager -Thanks for your interest in contributing to restic-manager. +Thanks for your interest in restic-manager. 
This document covers how +to set up a development environment, the conventions the project +follows, and how patches make it from your machine into `main`. -> This is a placeholder. The project is in pre-alpha (Phase 1 / MVP). A -> full contributor guide will land alongside the Phase 5 OSS-readiness -> work — see [`tasks.md`](./tasks.md) P5-02. Until then the notes below -> apply. +## Project status and scope -## Before opening a PR +restic-manager is in pre-1.0. Core functionality (Phases 0–4) is +landed; OSS-readiness polish is in progress. The top of +[`tasks.md`](./tasks.md) tracks what's next; [`spec.md`](./spec.md) +is the canonical design doc and the source of truth for any +"why is it built this way" question. -1. Open an issue first for non-trivial changes — the design is still - moving (see [`spec.md`](./spec.md)) and unsolicited large PRs may - conflict with in-flight work. -2. `make lint test` should pass. -3. Match the existing code style — `gofumpt`, `goimports`, no comments - that just restate what the code does. -4. Keep commits focused; one logical change per commit. +The project is **single-maintainer, hobbyist-scale, and licensed +under [PolyForm Noncommercial 1.0.0](./LICENSE)**. That has two +practical implications: -## Reporting security issues +1. Big PRs without prior discussion may be declined for fit + reasons even when they're correct — opening an issue first lets + us check alignment cheaply. +2. Commercial use is not permitted by the license. Bug reports and + patches from operators of personal/community deployments are + very welcome. -Please do **not** open a public issue for security problems. A -`SECURITY.md` with a private disclosure path will be added in Phase 5 -(P5-05). Until then, contact the repository owner directly via the -contact details on their gitea profile. 
+## Getting started + +### Prerequisites + +- Go 1.25 or newer (`go.mod` is the source of truth) +- `make` +- For the front-end CSS bundle: nothing extra — `make build` + downloads a pinned `tailwindcss` standalone binary into `bin/`. +- For the docs site: nothing extra — `make docs` does the same trick + with `mdbook`. +- For end-to-end tests: Docker + Docker Compose, plus `npx` for + Playwright. + +### One-time setup + +```sh +git clone https://gitea.dcglab.co.uk/steve/restic-manager.git +cd restic-manager +make build # compiles bin/restic-manager-{server,agent} +make test # full unit + integration test sweep +make lint # gofumpt + goimports + golangci-lint +``` + +### Running locally + +For most development, the [smoke environment](./docs/e2e-smoke.md) +is the path of least resistance: + +```sh +make smoke-restart # rebuilds, launches as a systemd --user unit +make smoke-logs # tail of the server log +``` + +Then point a browser at `http://127.0.0.1:8080`. The first run +prints a one-time bootstrap token to the log; use it to create the +admin user. + +## Code conventions + +### Style + +- `gofumpt` for formatting; `goimports` for import grouping. + Both run via the pre-commit hook in this repo. +- `golangci-lint` with `.golangci.yml` defaults; CI rejects on lint + errors. +- UK English in identifiers, comments, log messages, and UI strings + (the misspell linter is configured for the UK locale — see + P3-X5 for the original sweep). +- Comments explain **why**, not what; avoid restating the code. + A surprising invariant or an external constraint is worth + writing down. "Adds 1 to x" is not. +- `slog` for structured logs. Never log secrets — and especially + never the merged-creds rest-server URL (see [`CLAUDE.md`](./CLAUDE.md)). + +### File and package layout + +- `cmd/server` and `cmd/agent` are the two binary entry points. +- `internal/` holds everything that's not part of the public Go + API (which is none of it — restic-manager isn't a library). 
+- Per-feature packages live under `internal/server/...` for the + control plane and `internal/agent/...` for the agent. +- `web/templates/` are HTML templates rendered with the standard + library; embedded via `web.FS`. + +### Tests + +- Unit tests live alongside the code as `*_test.go`. Use the + in-process sqlite store (`store.Open(":memory:")`) when you need + state — there is no test mock layer to maintain. +- HTTP handlers test through `httptest.NewServer` against the real + router; see `internal/server/http/auth_test.go` for the canonical + fixture pattern. +- End-to-end tests live in `e2e/` and run against a Docker Compose + stack. See [`docs/e2e.md`](./docs/e2e.md). + +### Database migrations + +- Migrations are hand-rolled SQL in `internal/store/migrations/` + and embedded via `embed.FS`. +- Prefer column-level `ALTER TABLE` over rebuilds — see + [`CLAUDE.md`](./CLAUDE.md) "Migrations" section for the FK-cascade + trap that bit migration 0007's first draft. + +## Workflow + +### Before opening a PR + +1. **Open an issue first** for non-trivial changes. The design is + still moving; an issue lets us agree on direction cheaply. +2. Run `make lint test` locally — both must pass. +3. Match existing code style (see above). +4. Keep commits focused: one logical change per commit. Imperative + subject lines, body explaining why if it isn't obvious. +5. Don't add `Co-Authored-By` trailers — repo policy. If you used + AI assistance in writing the patch, that's fine; we just don't + pollute every commit message with attribution boilerplate. + +### Pull requests + +PRs target `main`. CI runs lint + tests on Linux amd64/arm64 and +Windows amd64; all three must be green to merge. Squash-merge is +the default; the PR title becomes the merge-commit subject, so +keep it short and informative. + +The PR template asks for: + +- A short description of what changed and why. +- A test plan (commands run, scenarios verified). 
+- Anything reviewers need to know to assess the change (related + issue, follow-up work, deferred concerns). + +### Reporting bugs + +Open an issue with: + +- restic-manager version (`restic-manager-server --version`) and agent version. +- restic version on the affected host. +- Steps to reproduce. +- Server and agent logs (sanitise any tokens before pasting). + +Security-sensitive bugs go through the [SECURITY.md](./SECURITY.md) +disclosure path instead — please don't open a public issue for +them. + +### Suggesting features + +Open an issue describing the use case (not just the proposed +solution). The roadmap in `tasks.md` shows where the project is +heading; if the suggestion fits a future phase we'll wire it in +there. If it falls outside the project's scope (multi-tenancy, SaaS, +non-restic backends — see `spec.md` §2 non-goals) we'll say so +early to save your time. + +## Code of conduct + +Project participation is governed by [CODE_OF_CONDUCT.md](./CODE_OF_CONDUCT.md). +The short version: be civil; assume good faith; harassment is not +tolerated. ## License -By contributing you agree that your contributions are licensed under -the [PolyForm Noncommercial 1.0.0](./LICENSE) license. +By contributing you agree that your contributions are licensed +under the [PolyForm Noncommercial 1.0.0](./LICENSE) license. diff --git a/Makefile b/Makefile index 767a534..b258757 100644 --- a/Makefile +++ b/Makefile @@ -24,7 +24,18 @@ TAILWIND_URL := https://github.com/tailwindlabs/tailwindcss/releases/downlo TAILWIND_INPUT := web/styles/input.css TAILWIND_OUTPUT := web/static/css/styles.css -.PHONY: help build server agent test test-race lint fmt tidy clean run-server run-agent docker release tailwind tailwind-watch setup hooks smoke-restart smoke-stop smoke-status smoke-logs smoke-deploy +# mdBook for the docs site (P5-01). Single static binary, no +# Rust toolchain — same pattern as Tailwind.
+MDBOOK_VERSION ?= v0.4.51 +MDBOOK_OS := $(shell uname -s | tr A-Z a-z) +MDBOOK_TRIPLE := $(patsubst arm64,aarch64,$(shell uname -m))-$(if $(filter darwin,$(MDBOOK_OS)),apple-darwin,unknown-linux-gnu) +MDBOOK_BIN := $(BIN_DIR)/mdbook +MDBOOK_TARBALL := mdbook-$(MDBOOK_VERSION)-$(MDBOOK_TRIPLE).tar.gz +MDBOOK_URL := https://github.com/rust-lang/mdBook/releases/download/$(MDBOOK_VERSION)/$(MDBOOK_TARBALL) +DOCS_BOOK_DIR := docs/book +DOCS_BOOK_OUT := $(DOCS_BOOK_DIR)/book + +.PHONY: help build server agent test test-race lint fmt tidy clean run-server run-agent docker release tailwind tailwind-watch docs docs-watch setup hooks smoke-restart smoke-stop smoke-status smoke-logs smoke-deploy # ---- smoke-env tooling ------------------------------------------------- # The smoke server runs as a transient user-systemd unit so it survives @@ -60,6 +71,18 @@ tailwind-watch: $(TAILWIND_BIN) ## Watch and rebuild on every save @mkdir -p $$(dirname $(TAILWIND_OUTPUT)) $(TAILWIND_BIN) -c tailwind.config.js -i $(TAILWIND_INPUT) -o $(TAILWIND_OUTPUT) --watch +$(MDBOOK_BIN): + @mkdir -p $(BIN_DIR) + @echo "==> downloading mdbook $(MDBOOK_VERSION) ($(MDBOOK_TRIPLE))" + curl -fsSL "$(MDBOOK_URL)" | tar -xz -C $(BIN_DIR) mdbook + @chmod +x $@ + +docs: $(MDBOOK_BIN) ## Build the docs/book/ mdBook site into docs/book/book/ + $(MDBOOK_BIN) build $(DOCS_BOOK_DIR) + +docs-watch: $(MDBOOK_BIN) ## Serve the docs site at http://127.0.0.1:3000 with live reload + $(MDBOOK_BIN) serve $(DOCS_BOOK_DIR) -n 127.0.0.1 -p 3000 + agent: ## Build the agent binary @mkdir -p $(BIN_DIR) CGO_ENABLED=0 go build $(GOFLAGS) -ldflags "$(LDFLAGS)" -o $(AGENT_BIN) ./cmd/agent @@ -90,7 +113,7 @@ tidy: ## go mod tidy go mod tidy clean: ## Remove build artifacts - rm -rf $(BIN_DIR) coverage.out coverage.html $(TAILWIND_OUTPUT) + rm -rf $(BIN_DIR) coverage.out coverage.html $(TAILWIND_OUTPUT) $(DOCS_BOOK_OUT) run-server: server ## Build and run the server $(SERVER_BIN) diff --git a/README.md b/README.md index 56419ed..b421d6d 100644 --- 
a/README.md +++ b/README.md @@ -1,36 +1,62 @@ # restic-manager Self-hosted, browser-based, single-pane-of-glass for managing -[restic](https://restic.net) backups across a fleet of Linux and Windows -endpoints. +[restic](https://restic.net) backups across a fleet of Linux and +Windows endpoints. -> Status: pre-alpha. Phase 0 (project bootstrap) complete; Phase 1 (MVP) in -> progress. See [`spec.md`](./spec.md) for the design and -> [`tasks.md`](./tasks.md) for the roadmap. +> **Status:** pre-1.0, feature-complete for the original use +> case. Phases 0–4 + 6 are landed (MVP, scheduling, restore, +> RBAC + OIDC, observability); Phase 5 (OSS readiness — docs site, +> contributor onboarding, end-to-end CI) is in flight. See +> [`spec.md`](./spec.md) for the design and [`tasks.md`](./tasks.md) +> for the live roadmap. -## What it does (target) +## What it does -- Central visibility into backup state for every endpoint -- Trigger any restic operation remotely (`backup`, `forget`, `prune`, - `check`, `unlock`, `snapshots`, `stats`, `diff`, `restore`) -- Manage per-host backup schedules from the UI -- Live job progress streamed back to the UI -- Restore wizard (browse snapshots, pick paths, restore to original or - alternate host) -- Repo health surfacing (size, dedup ratio, last check, lock state) -- Alerting on failure or staleness -- Cross-platform agent (Linux + Windows) -- Ransomware-resistant repo access via append-only credentials +- Central visibility into backup state for every endpoint. +- Trigger any restic operation remotely (`backup`, `forget`, + `prune`, `check`, `unlock`, `snapshots`, `stats`, `diff`, + `restore`). +- Per-host schedules with named source groups + retention. +- Live job log streamed to the browser; downloadable as + text/NDJSON afterwards. +- Restore wizard: browse a snapshot's tree, pick paths, restore + in-place or to a new directory. +- Repo health surfacing (size, raw size, last check, lock state), + plus a 30/90-day repo-size trend. 
+- Alerting over webhook, ntfy, or SMTP. +- Cross-platform agent (Linux systemd + Windows SCM). +- Append-only-friendly: separate admin credential for prune. +- Optional Prometheus `/metrics` endpoint + sample Grafana + dashboard. +- Optional OIDC SSO (Authelia, Authentik, etc.). -## Architecture (one-line summary) +## Screenshots -A small Go control-plane on the Proxmox host, lightweight Go agents on each -endpoint that hold an outbound WebSocket to the control-plane, and a -`restic/rest-server` on Unraid that holds the actual backup data. The -control-plane never touches backup bytes. +| Sign in | Empty dashboard | Add host | +|:-------:|:---------------:|:--------:| +| ![Sign in](docs/screenshots/01-login.png) | ![Dashboard, fresh](docs/screenshots/02-dashboard-empty.png) | ![Add host](docs/screenshots/03-add-host.png) | + +| Alerts | Settings | Audit log | +|:------:|:--------:|:---------:| +| ![Alerts](docs/screenshots/04-alerts.png) | ![Settings](docs/screenshots/05-settings.png) | ![Audit log](docs/screenshots/06-audit.png) | + +(Screenshots from a fresh smoke install with no hosts. A populated +fleet view and the live-log + restore wizard surfaces are part of +the docs site under [`docs/book/`](./docs/book) — `make docs` to +render locally.) + +## Architecture (one-line) + +A small Go control-plane in Docker, lightweight Go agents on each +endpoint holding an outbound WebSocket to the control-plane, and +a restic repository (rest-server, S3, B2, SFTP — anything restic +speaks) that holds the actual backup data. **The control-plane +never touches backup bytes.** Full architecture diagram and component breakdown: -[`spec.md` §3](./spec.md). +[`spec.md` §3](./spec.md), or the rendered version in the +[docs site](./docs/book/src/concepts/architecture.md). 
## Repository layout @@ -38,31 +64,63 @@ Full architecture diagram and component breakdown: cmd/server/ control-plane binary cmd/agent/ endpoint agent binary internal/api shared API types (REST + WS envelopes) -internal/server/ HTTP, WS, UI handlers +internal/server/ HTTP, WS, UI handlers, alert engine internal/agent/ service integration, restic runner, local scheduler internal/restic restic CLI wrapper internal/store SQLite persistence -internal/crypto secret encryption +internal/crypto secret encryption (AEAD) internal/auth passwords, sessions, agent tokens web/ server-rendered templates + static assets -deploy/ Dockerfile, docker-compose.yml, install scripts -design/ UI wireframes (Phase 0 design pass) +deploy/ Dockerfile, docker-compose.yml, install scripts, Grafana dashboard +docs/ prose docs + the mdBook site under docs/book +e2e/ compose stack + Playwright tests for end-to-end CI ``` +## Quickstart + +The reference deployment is a single Docker container fronted by +your existing reverse proxy. See the [installation guide](docs/book/src/getting-started/install.md) +for the full path; the very short version: + +```sh +export RM_VERSION=v0.9.0 # pin a real tag +export RM_BASE_URL=https://restic.example.com +export RM_TRUSTED_PROXY=10.0.0.0/8 +docker compose -f deploy/docker-compose.yml up -d +``` + +The server prints a one-time bootstrap token to the log on first +start. POST it to `/api/bootstrap` (or open `/bootstrap` in a +browser) to create the admin user. + ## Local development -Requires Go 1.25+ (built and tested on 1.26). The floor is set by -`modernc.org/sqlite` v1.50. +Requires Go 1.25+. The floor is set by `modernc.org/sqlite` v1.50. ```sh make build # builds cmd/server and cmd/agent into ./bin make test # runs go test ./... 
make lint # runs golangci-lint -make run-server # runs the server (dev defaults) +make smoke-restart # systemd --user smoke server (see CLAUDE.md) +make docs # renders the mdBook site to docs/book/book/ ``` +End-to-end test harness against a Docker Compose stack with a +sibling Linux agent: see [`docs/e2e.md`](docs/e2e.md). Runs in CI +on every PR. + +## Documentation + +- **Concepts and operator guides**: [docs site](docs/book/src/intro.md), + rendered with `make docs`. +- **Reverse-proxy setup**: [docs/reverse-proxy.md](docs/reverse-proxy.md). +- **Prometheus + Grafana**: [docs/prometheus.md](docs/prometheus.md). +- **End-to-end test harness**: [docs/e2e.md](docs/e2e.md). +- **Security policy**: [SECURITY.md](SECURITY.md). +- **Contributing**: [CONTRIBUTING.md](CONTRIBUTING.md). + ## License -PolyForm Noncommercial 1.0.0 — see [`LICENSE`](./LICENSE). Free for personal, -hobby, research, educational, governmental, and other noncommercial use. -Commercial use requires a separate license. +[PolyForm Noncommercial 1.0.0](./LICENSE). Free for personal, +hobby, research, educational, governmental, and other noncommercial +use. Commercial use requires a separate license. diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 0000000..41a8bf3 --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,137 @@ +# Security policy + +restic-manager handles credentials that grant access to backup +repositories — losing them means an attacker can read or destroy a +fleet's backups. We take security reports seriously even at this +project's small scale. + +## Supported versions + +Pre-1.0, only the latest tagged release on `main` is supported. +Backporting fixes to older tags is not currently offered. 
+ +| Version | Supported | +|--------------------|----------------| +| `main` HEAD | Yes | +| Latest released tag| Yes | +| Anything older | No | + +## Reporting a vulnerability + +**Please don't open a public issue for security problems.** + +Instead, use one of these private channels: + +1. **Gitea private message** to the repository owner. The + instance is at <https://gitea.dcglab.co.uk> and the owner's + profile (`steve`) has direct-message contact set up. +2. **Email** to the address on the maintainer's Gitea profile. + Use a subject like `[SECURITY] restic-manager: <short summary>` + so it doesn't get lost. PGP optional — if you want to encrypt, + ask for a key first. + +If you don't get an acknowledgement within **3 working days**, +please escalate through the other channel — solo maintainers do +miss things, and the goal here is to fix the problem, not to +preserve protocol. + +### What to include + +- A description of the issue and the impact (what does an attacker + gain? confidentiality, integrity, availability?). +- Affected component (server, agent, install script, docs). +- Affected version (`restic-manager-server --version`). +- Reproduction steps if you have them. A working PoC is welcome + but not required — a credible threat model is enough. +- Whether you intend to publish a writeup, and any timing + preferences. + +### What we'll do + +1. Acknowledge receipt within 3 working days. +2. Confirm or refute the issue, and agree a rough severity (CVSS + or just "this is bad / this isn't"). Asking clarifying + questions is normal at this stage — please don't read it as + foot-dragging. +3. Develop a fix on a private branch, test it, and prepare a + release. +4. Coordinate disclosure timing with you. The default is **30 + days from confirmed report to public disclosure**, with a + patched release published before the disclosure date. Faster + if a workable PoC is already circulating; slower only by + mutual agreement. +5. 
Credit the reporter in the release notes (or omit the credit + if you'd rather stay anonymous — your choice). + +## Scope + +In scope: + +- The server binary (`cmd/server`) and any HTTP, WebSocket, or CLI + surface it exposes. +- The agent binary (`cmd/agent`) and the way it consumes commands + from the server. +- The install scripts (`deploy/install/install.sh`, `install.ps1`) + and the systemd unit shipped with them. +- The docker-compose reference deployment and the docker image we + publish. +- Any cryptographic primitive choice or implementation detail + (AEAD, token hashing, session handling, OIDC handshake). +- Documentation that, if followed, leads operators into an + insecure configuration. + +Out of scope (not because they aren't real problems, just not ones +this report channel can act on): + +- Vulnerabilities in restic itself — report those upstream at + <https://github.com/restic/restic>. +- Vulnerabilities in third-party dependencies that haven't yet been + patched upstream — report upstream first. +- Issues that require pre-authenticated admin access on the control + plane (admins can already do everything; that's not a privilege + escalation, that's the design). +- DoS via resource exhaustion on a deployment without the + recommended reverse proxy / rate limiting in front (see + `docs/reverse-proxy.md`). +- Social-engineering scenarios that don't have a technical hook + into the project's own surfaces. + +## Threat model summary + +For context (longer version in [`spec.md`](./spec.md) §11): + +- The server is **HTTP-only**; TLS termination, ACME, HSTS, and + edge rate-limiting are the reverse proxy's job. +- Credentials are encrypted at rest with an AEAD key loaded from + `RM_SECRET_KEY_FILE`. The same key encrypts agent secrets that + travel to the agent over the WS channel. +- Agents authenticate with bearer tokens issued at enrolment and + hashed at rest. 
Compromise of the server DB does **not** leak + bearer tokens in plaintext, but does leak the hashes (which is + enough to log in *as* the agent until the operator revokes — + see [NS-01 / NS-02](./tasks.md) for the revoke + regenerate + flows). +- The control plane intentionally **never touches backup bytes** — + the agent runs `restic` directly against the repo. A + compromised control plane can dispatch new jobs but cannot + exfiltrate snapshot contents in-band. +- Append-only credentials are first-class. Forget/prune jobs use a + separate, admin-marked credential that the server only pushes + for the duration of a maintenance dispatch. + +## Hardening checklist for operators + +- Run behind a TLS-terminating reverse proxy (Caddy/nginx/Traefik). +- Set `RM_TRUSTED_PROXY` to the proxy's CIDR so request IPs aren't + spoofable. +- Back up `RM_SECRET_KEY_FILE` separately from the database. + Without it the encrypted creds are unrecoverable. +- Use append-only credentials for the everyday backup path; only + the optional admin credential should have write/forget/prune + power. +- Disable users (don't delete) when staff change roles — bearer + tokens stay valid until rotated. +- Watch the alert and audit-log views during enrolment of new + hosts. + +Thanks for helping keep restic-manager users safe. diff --git a/docs/book/book.toml b/docs/book/book.toml new file mode 100644 index 0000000..a4ef953 --- /dev/null +++ b/docs/book/book.toml @@ -0,0 +1,19 @@ +[book] +title = "restic-manager" +description = "Self-hosted control plane for restic backups across a fleet of Linux and Windows endpoints." 
+authors = ["Steve Cliff"] +language = "en-GB" +multilingual = false +src = "src" + +[output.html] +default-theme = "ayu" +preferred-dark-theme = "ayu" +git-repository-url = "https://gitea.dcglab.co.uk/steve/restic-manager" +git-repository-icon = "fa-code-fork" +edit-url-template = "https://gitea.dcglab.co.uk/steve/restic-manager/_edit/main/docs/book/{path}" +no-section-label = false + +[output.html.fold] +enable = true +level = 2 diff --git a/docs/book/src/SUMMARY.md b/docs/book/src/SUMMARY.md new file mode 100644 index 0000000..558e792 --- /dev/null +++ b/docs/book/src/SUMMARY.md @@ -0,0 +1,40 @@ +# Summary + +[Introduction](./intro.md) + +# Getting started + +- [Installing the server](./getting-started/install.md) +- [Enrolling your first host](./getting-started/enrolling-hosts.md) +- [Running behind a reverse proxy](./getting-started/reverse-proxy.md) + +# Concepts + +- [Architecture](./concepts/architecture.md) +- [Credentials and how they flow](./concepts/credentials.md) +- [Schedules and source groups](./concepts/schedules-and-source-groups.md) +- [Repo maintenance](./concepts/repo-maintenance.md) + +# Operations + +- [Backups and restores](./operations/backups-and-restores.md) +- [Alerts and notifications](./operations/alerts.md) +- [Observability with Prometheus](./operations/observability.md) +- [Updating agents](./operations/updates.md) + +# Security + +- [Threat model](./security/threat-model.md) +- [Hardening checklist](./security/hardening.md) +- [Reporting vulnerabilities](./security/disclosure.md) + +# Reference + +- [Environment variables](./reference/env-vars.md) +- [HTTP endpoints](./reference/http-endpoints.md) + +--- + +[Contributing](./contributing.md) +[Roadmap](./roadmap.md) +[License](./license.md) diff --git a/docs/book/src/concepts/architecture.md b/docs/book/src/concepts/architecture.md new file mode 100644 index 0000000..f1706da --- /dev/null +++ b/docs/book/src/concepts/architecture.md @@ -0,0 +1,121 @@ +# Architecture + +## Components 
+ +``` +┌────────────────────────────────────────────────────────────┐ +│ Server (control plane, single process) │ +│ * chi-based HTTP API + HTMX server-rendered UI │ +│ * WebSocket hub for agent fan-out + browser fan-out │ +│ * SQLite store (modernc.org/sqlite, pure Go) │ +│ * AEAD encryption helpers │ +│ * Alert engine + notification hub │ +└────────────┬───────────────────────────────────┬───────────┘ + │ outbound WS only │ HTTP(S) + │ │ +┌────────────▼─────────────┐ ┌────────────▼─────────────┐ +│ Agent (per host) │ │ Browser (operator) │ +│ * coder/websocket │ │ * htmx + a tiny bit │ +│ * cron for schedules │ │ of vanilla JS for │ +│ * restic wrapper │ │ live job updates │ +│ * sysinfo collector │ └──────────────────────────┘ +└────────────┬─────────────┘ + │ subprocess: restic ... + │ +┌────────────▼─────────────────────────────────────────────────┐ +│ restic repository (rest-server, S3, B2, SFTP, local …) │ +│ Backup data flows directly here. Server never touches it. │ +└──────────────────────────────────────────────────────────────┘ +``` + +## Why outbound-only WebSockets? + +The agent dials the server on `/ws/agent` with a bearer token. The +server doesn't initiate connections to the agent. Three reasons: + +1. **Firewall friendliness.** Nothing on the endpoint needs an + inbound port; this works behind the typical "branch office NAT" + without router config. +2. **Single auth point.** The bearer token is the only credential + that crosses the boundary; the agent never accepts an + incoming socket. +3. **Reconnect semantics are simpler.** When the connection drops + (NAT timeout, server restart, transient network glitch) the + agent backs off and re-dials; the server marks the host + offline after 90s and lets the alert engine raise a stale-host + alert. + +## Why SQLite? + +SQLite covers the project's HA non-goal: there isn't one. A small +control plane managing twelve endpoints does not need replication +or a separate database tier. 
SQLite gives us: + +- A single file to back up (plus the secret key). +- Hand-rolled migrations under `internal/store/migrations/` — + no migration framework lock-in. +- `WAL` mode plus per-connection foreign-key enforcement. + +The migrations define the entire schema; there's no ORM or +query-builder layer between Go code and SQL. + +## Why the agent runs `restic` itself, not via the server + +The control plane never holds backup bytes in flight. That's +deliberate: + +- A compromised control plane cannot exfiltrate snapshot + contents in-band — at worst it can dispatch new backup or + forget jobs (audit-logged) but the data path is between the + agent and the repository. +- The same agent process can target whichever transport restic + natively supports (rest-server, S3, B2, SFTP, local), no + separate mux on the server side. + +## Job lifecycle + +``` + ┌──────────────────────┐ +operator → │ POST /hosts/{id}/ │ + │ run-backup │ + └──────────┬───────────┘ + │ 1. INSERT INTO jobs (status='queued') + │ 2. dispatch command.run over WS + ▼ + ┌──────────────────────┐ + │ Agent dispatches │ + │ restic subprocess │ + └──────────┬───────────┘ + │ + │ 3. job.started ───▶ store.MarkJobStarted + │ 4. job.progress ───▶ JobHub broadcast (live UI) + │ 5. log.stream ───▶ append to job_logs + │ 6. job.finished ───▶ store.MarkJobFinished + │ + alert engine eval + │ + (P6) metrics histogram + ▼ + terminal: succeeded | failed | cancelled +``` + +Operators see live updates because the browser subscribes to +`/api/jobs/{id}/stream`, and the WS handler broadcasts each +agent-emitted envelope to all live subscribers in addition to +persisting it. + +## What scheduling looks like + +- The agent runs a local `robfig/cron/v3` instance. +- The server pushes the desired schedule set to the agent on + hello + after every CRUD change. +- When the agent's cron fires, it sends `schedule.fire` to the + server. 
The server creates a job row, sends `command.run` back, + and the agent dispatches a normal backup. +- If the WS drops between fire and run, the server queues the + schedule firing into `pending_runs` and drains on agent + reconnect — no missed scheduled backups due to network blips. + +For everything that isn't a backup (forget, prune, check), the +server runs a 60-second maintenance ticker against +`host_repo_maintenance` rows and dispatches the relevant command +when a cadence is due. The agent's local cron only handles +backups. diff --git a/docs/book/src/concepts/credentials.md b/docs/book/src/concepts/credentials.md new file mode 100644 index 0000000..58e1ed4 --- /dev/null +++ b/docs/book/src/concepts/credentials.md @@ -0,0 +1,98 @@ +# Credentials and how they flow + +restic-manager handles three credential surfaces: + +1. **Operator credentials** — the username + password (or OIDC + identity) that logs into the UI. +2. **Agent bearer tokens** — issued at enrolment, used by the + agent to authenticate its WebSocket to the server. +3. **Repo credentials** — the rest-server / S3 / B2 / SFTP + credentials the agent passes to `restic` itself. + +Each has a different threat model and storage strategy. + +## Operator credentials + +- Local users are stored in `users` with a bcrypt password hash. +- Sessions are random tokens minted at login, stored hashed in + the `sessions` table, expired after 24h. Cookie is HttpOnly, + SameSite=Lax, and Secure (when `RM_COOKIE_SECURE=true`, + default). +- OIDC users carry `auth_source='oidc'` and an `oidc_subject` + pinning their IdP identity. Local password login is rejected + for OIDC users. +- Disabling a user soft-deletes them via `disabled_at` — + pre-existing sessions are invalidated on the next request. + +## Agent bearer tokens + +- Minted at enrolment, hashed at rest with `auth.HashToken`. 
+- The plaintext token only exists in memory at enrolment time + and on the agent's filesystem (`/etc/restic-manager/agent.yaml`, + mode `0600`, owned by the service user). +- Compromise of the server DB leaks the hashes, which is enough + to *log in as that agent* until you revoke. Compromise of the + agent host leaks the plaintext (via the config file) — same + end result. +- Rotation: re-enrol the host. Today there's no in-place rotate; + the operator deletes the host (which cascades, including + revoking the bearer hash) and re-runs the install command. + +## Repo credentials + +This is the credential that ultimately matters for backup +integrity. restic-manager keeps two slots per host: + +- **The everyday credential** (`host_credentials.kind = ''`). + Append-only-friendly: this is the one your backup schedule + uses. It can write but not delete or forget. +- **The admin credential** (`host_credentials.kind = 'admin'`). + Has full delete rights. Only pushed to the agent transiently + while a `prune` or `forget` job is dispatching, and discarded + by the agent after the job ends. + +### Encryption flow + +1. Operator types the credential into the UI or the install form. +2. Server AEAD-encrypts the cred (`crypto.AEAD.Encrypt`) using the + key in `RM_SECRET_KEY_FILE`. The plaintext is dropped from + memory. +3. Encrypted blob is stored in `host_credentials.cred_blob`. +4. When the agent connects, the server decrypts the blob and + sends the **plaintext** down the WebSocket inside a + `config.update` envelope. +5. The agent stores the plaintext in its in-memory secrets store + for the lifetime of the process; it's reloaded fresh on every + server-side push. +6. When a job runs, the agent merges the credential into the + restic environment (`restic.Env.RepoURL` stays bare; the + `user:pass@…` form is built only inside `envSlice()` at the + moment of `exec.Command`). + +The merged form is **never logged**. 
The slog package's structured +output gets `restic.RedactURL()` for any URL it has cause to +mention. + +### Why push plaintext over the wire? + +The transport itself is the trust boundary: the WebSocket runs +inside the same TLS-terminated reverse-proxy connection your +browser uses, and the agent has already authenticated with its +bearer token. Re-encrypting the payload on top of that would just +move the key-management problem somewhere else. + +If your reverse proxy isn't TLS-terminated, the deployment is +already broken — see [Hardening](../security/hardening.md). + +## Setup tokens (admin-driven) + +When an admin creates a new user, the server mints a one-time +setup link valid for 1 hour. The hash is stored; the raw token +is shown to the admin once. The user opens the link, sets a +password, and is dropped into a session. Expired tokens are +swept on the alert engine's 60s tick. + +Same pattern for enrolment tokens: the raw token only exists in +memory at mint time, and the install snippet is the operator's +only chance to capture it. If you lose it, regenerate via the +**Add host** page (NS-02). diff --git a/docs/book/src/concepts/repo-maintenance.md b/docs/book/src/concepts/repo-maintenance.md new file mode 100644 index 0000000..d4a3995 --- /dev/null +++ b/docs/book/src/concepts/repo-maintenance.md @@ -0,0 +1,85 @@ +# Repo maintenance + +Backups go in; without maintenance, repos grow forever and +eventually fall over. restic-manager runs three maintenance +operations on a per-host cadence: + +| Command | What it does | Default cadence | +|----------|-------------------------------------------------------------|-----------------| +| `forget` | Marks snapshots eligible for removal per the retention policy attached to each source group. Cheap; runs append-only. | Daily after the last backup of the day | +| `prune` | Reclaims space from the repo. Requires the **admin** credential (write+delete). | Weekly, off-peak | +| `check` | Verifies repo integrity. 
Sub-options surface lock state. | Weekly, with `--read-data-subset N%` to sample pack files | + +A new field on each host row, `host_repo_maintenance`, holds the +cron expressions and last-fire anchors. The maintenance ticker on +the server runs every 60s, finds hosts whose next-fire is due, +and dispatches the right command. The agent's local cron is +**only** for backups. + +## Why server-side and not agent-side? + +The agent's cron knows about backups because backups are +per-source-group. Maintenance is per-repo, not per-source-group, +so doing it server-side keeps the per-host wiring simple: + +- One ticker, not N agent crons to keep in sync. +- Cancelling a maintenance dispatch is just "don't dispatch the + next one" — no agent-side state to clean up. +- Skipping offline hosts is trivial (no queue; only scheduled + *backups* queue into `pending_runs`). + +## Forget and the multi-group payload + +A single `forget` job can target several source groups at once. +The wire envelope (`ForgetGroups`) carries one entry per group, +each with its retention policy. The agent runs N +`restic forget --tag <group> --keep-...` invocations in sequence, +streams their output, and reports a single terminal status. + +## Prune and the admin credential + +Prune mutates the repo. The everyday append-only credential +**cannot** prune — that's the whole point of append-only. +restic-manager keeps a second slot per host (`kind = 'admin'`) +for the credential that can. + +When a prune is dispatched (cadence-driven or operator-driven): + +1. Server pushes the admin credential to the agent in a fresh + `config.update`. +2. Agent runs `restic prune` with the merged credential. +3. Job finishes; agent discards the admin credential from its + in-memory secrets store. + +The server never logs the merged URL (see +[Credentials](./credentials.md)). + +## Check and lock state + +`restic check` warns about stale locks when it finds them. 
The +agent ships every check's output back as a `repo.stats` envelope +and a stream of log lines; if a stale lock is detected, the +**Repo** page surfaces a banner with an **Unlock** button. The +operator-only `unlock` command runs `restic unlock` and clears +the banner. + +`unlock` has no cadence — it's a manual action, never automatic. +Auto-unlocking would mask the cause (probably a previously +crashed long-running operation) and risk corrupting an +operation the operator has merely lost track of. + +## Repo stats + +After every backup, check, prune, and unlock, the agent runs +`restic stats --json --mode raw-data` and ships the result as a +`repo.stats` envelope. The server stores this in +`host_repo_stats` (latest only) and `host_repo_stats_history` +(one row per host per day, last-write-wins per column — a +prune-only patch never nulls a backup-time size). + +The host detail page surfaces: + +- Total size + raw size in the vitals strip. +- Last-check timestamp + colour-coded status. +- Last-prune timestamp. +- 30/90-day repo size trend chart. diff --git a/docs/book/src/concepts/schedules-and-source-groups.md b/docs/book/src/concepts/schedules-and-source-groups.md new file mode 100644 index 0000000..0a74bf5 --- /dev/null +++ b/docs/book/src/concepts/schedules-and-source-groups.md @@ -0,0 +1,105 @@ +# Schedules and source groups + +Two related but separable ideas: + +- A **source group** is a named bundle of "what to back up": + include paths, exclude patterns, retention policy, retry + configuration, optional pre/post hooks. The group's name is + used as the restic snapshot tag, so retention can target it + with `restic forget --tag <name>`. +- A **schedule** is a cron expression that, when it fires, + triggers a backup of one or more source groups on a host. + +Decoupling them means you can have one schedule covering several +groups (e.g. 
`0 1 * * *` running both `system` and `data`), and +each group has its own retention without duplicating policy +across schedules. + +## Source group anatomy + +```yaml +name: data +includes: + - /var/lib/postgresql + - /home +excludes: + - /home/*/.cache + - /home/*/Downloads +retention: + keep_last: 7 + keep_daily: 14 + keep_weekly: 4 + keep_monthly: 6 +retry_max: 3 +retry_backoff_seconds: 600 +pre_hook: | + pg_dump -U postgres -F c -f /var/lib/postgresql/dumps/all.dump +post_hook: | + rm -f /var/lib/postgresql/dumps/all.dump +``` + +### Conflict detection + +If your retention policy says `keep_hourly: 24` but no schedule +points at this group sub-daily, the UI surfaces a +**conflict-dimension banner** ("`hourly` won't be honoured — +no schedule fires more often than once a day"). The flag is +stored on the source group (`conflict_dimension`) and refreshed +whenever a schedule or group changes. + +### Hooks + +`pre_hook` and `post_hook` run on the agent host inside +`/bin/sh -c` (`cmd.exe /C` on Windows). Output is streamed back +to the live job log as `hook(): …` lines. + +- A non-zero `pre_hook` exit aborts the backup. +- `post_hook` always runs, with `RM_JOB_STATUS=succeeded|failed` + in the environment. Use this for cleanup that must happen + whether the backup worked or not. +- Hooks only run for `kind=backup` jobs. They do not run for + `forget`, `prune`, `check`, etc. +- AEAD-encrypted at rest at the HTTP layer; the agent receives + plaintext over the WS channel. + +A "host default" pair of hooks lives on the host itself; a +source group's own hooks override them when set. + +## Schedule anatomy + +```yaml +cron: "0 2 * * *" +enabled: true +source_group_ids: + - + - +``` + +Slim by design: a schedule says **when** and **which groups**. +Everything else (paths, retention, hooks) lives on the groups. + +The agent's local cron fires the schedule. 
If the WebSocket is +down at fire time, the server queues the firing into +`pending_runs` and drains it on the next agent reconnect — a +short network blip won't lose the backup. + +### Last / next run + +The schedules tab shows "next" (computed by parsing the cron +expression with `robfig/cron/v3`) and "last" (the latest +`actor_kind=schedule` job in the `jobs` table) for every +schedule. The dashboard host row also surfaces `next 12h ago/from +now` when a single covering schedule is the run-now candidate. + +## Bandwidth limits + +Two places set restic's `--limit-upload` / `--limit-download`: + +1. **Host-wide caps** on the host row (`bandwidth_up_kbps`, + `bandwidth_down_kbps`). Pushed to the agent on hello and + after `PUT /api/hosts/{id}/bandwidth`. Apply to every restic + invocation on the host. +2. **Per-job overrides** on the per-source-group Run-now form. + Win over host caps for the lifetime of that one job. + +If neither is set, restic runs unthrottled. diff --git a/docs/book/src/contributing.md b/docs/book/src/contributing.md new file mode 100644 index 0000000..67f0b16 --- /dev/null +++ b/docs/book/src/contributing.md @@ -0,0 +1,17 @@ +# Contributing + +Full contributor guide: +[`CONTRIBUTING.md`](https://gitea.dcglab.co.uk/steve/restic-manager/src/branch/main/CONTRIBUTING.md) +in the repository root. + +The short version: + +- Open an issue first for non-trivial changes; the design is + still moving and unsolicited large PRs may conflict with + in-flight work. +- `make lint test` must pass. +- One logical change per commit, no `Co-Authored-By` trailers. +- UK English in identifiers and comments; comments explain the + **why** not the **what**. + +Code of conduct: [`CODE_OF_CONDUCT.md`](https://gitea.dcglab.co.uk/steve/restic-manager/src/branch/main/CODE_OF_CONDUCT.md). 
diff --git a/docs/book/src/getting-started/enrolling-hosts.md b/docs/book/src/getting-started/enrolling-hosts.md new file mode 100644 index 0000000..8889d0b --- /dev/null +++ b/docs/book/src/getting-started/enrolling-hosts.md @@ -0,0 +1,113 @@ +# Enrolling your first host + +The control plane only knows about hosts you've explicitly +enrolled. Two paths exist: + +1. **Token-based enrolment** — admin generates a token, pastes it + into an install command on the host. The host appears immediately, + already mapped to the desired repo. +2. **Announce-and-approve** — the agent runs without a token, + "announces" itself to the server, and a human in the UI accepts + the announcement. + +Token-based is the default and what most operators want; the +announce flow exists for the case where you can't easily paste a +secret onto the host (auto-imaged endpoints, scripted bring-ups +from a config repo). + +## Token-based enrolment + +### From the UI + +1. Click **+ Add host** on the dashboard. +2. Fill in the hostname, the restic repo URL, and the repo + credentials. The credentials are AEAD-encrypted at the server + immediately; what you paste is what the agent receives. +3. Optionally pick the initial source paths — these become the + first source group on the host. +4. Submit. The server mints a one-time token and shows you a copy- + pasteable install snippet. + +### On the host (Linux) + +```sh +curl -fsSL https://restic.example.com/install/install.sh | \ + sudo RM_SERVER=https://restic.example.com \ + RM_ENROL_TOKEN= \ + bash +``` + +The script: + +1. Detects architecture (`amd64` or `arm64`). +2. Downloads the agent binary from `/agent/binary?os=…&arch=…`. +3. Drops the systemd unit at + `/etc/systemd/system/restic-manager-agent.service`. +4. Runs the agent in `-enrol` mode, which posts the token and + stores the persistent bearer it gets back. +5. Enables and starts the unit. + +Within seconds the host should appear on the dashboard as +**online**. 
+ +### On the host (Windows) + +```pwsh +$env:RM_SERVER = "https://restic.example.com" +$env:RM_ENROL_TOKEN = "" +iwr -useb $env:RM_SERVER/install/install.ps1 | iex +``` + +Equivalent shape: registers a Windows service via the SCM +(see P2-16 for details), runs `-enrol`, starts the service. + +## Recovering a lost token + +Tokens are single-use and short-lived (1h). If you closed the tab +before pasting the install command, head to the **Add host** page — +outstanding tokens are listed there with a **Regenerate** button. +Regenerating revokes the old token's hash and mints a fresh raw +token while preserving the original repo credentials and initial +paths. (NS-02 in `tasks.md` if you want the design rationale.) + +## Announce-and-approve + +If the host can reach the server but you don't want to paste a +secret on it, run the agent in `-announce` mode: + +```sh +restic-manager-agent -announce \ + -server https://restic.example.com \ + -hostname myhost +``` + +The host appears in the **Pending hosts** panel on the dashboard +with its hostname, OS, arch, and the source IP that announced it. +Click **Accept**, fill in the repo URL + credentials, and the +server pushes the bearer over the still-open WebSocket. No +back-and-forth round trip. + +If you don't accept within an hour the announcement is swept. + +## What happens on the agent + +After enrolment, the agent: + +1. Connects via WebSocket to `/ws/agent` with its bearer token. +2. Sends a `hello` envelope with its OS, arch, agent version, + restic version, and protocol version. +3. Receives a `config.update` carrying its encrypted repo + credentials and any source-group paths. +4. Sits idle, sending a heartbeat every 30s. Operator-driven + "Run now" actions arrive as `command.run` envelopes; scheduled + jobs are driven by the agent's local cron. + +## Auto-init of the repository + +The first time a backup runs, the agent invokes `restic init` +against the repo you configured at enrolment. 
If the repo already +exists (`config file already exists`) the agent treats it as a +success and proceeds. The host's repo status (`unknown` → +`ready` / `init_failed`) is surfaced under the vitals strip on +the host detail page; if init fails, save fresh credentials in +the **Repo** tab to retry. diff --git a/docs/book/src/getting-started/install.md b/docs/book/src/getting-started/install.md new file mode 100644 index 0000000..106107b --- /dev/null +++ b/docs/book/src/getting-started/install.md @@ -0,0 +1,92 @@ +# Installing the server + +The reference deployment is a single Docker container fronted by +your existing reverse proxy. The image bundles the server binary, +the cross-compiled agent binaries, and the install scripts. + +## Prerequisites + +- A Linux host with Docker and Docker Compose. +- A reverse proxy in front (Caddy, nginx, Traefik) terminating + TLS on a public hostname. The server itself is HTTP-only by + design — see [Reverse proxy](./reverse-proxy.md) for why. +- A persistent volume for the server's data directory. + +## Quick start + +The reference compose file lives at +[`deploy/docker-compose.yml`](https://gitea.dcglab.co.uk/steve/restic-manager/src/branch/main/deploy/docker-compose.yml): + +```yaml +services: + restic-manager: + image: gitea.dcglab.co.uk/steve/restic-manager:${RM_VERSION:-latest} + restart: unless-stopped + environment: + RM_LISTEN: ":8080" + RM_DATA_DIR: "/data" + RM_BASE_URL: "https://restic.example.com" + # Trust your reverse proxy's CIDR so X-Forwarded-* are honoured. + RM_TRUSTED_PROXY: "10.0.0.0/8" + volumes: + - rm-data:/data + ports: + # Bind localhost only — your reverse proxy is the public face. + - "127.0.0.1:8080:8080" + +volumes: + rm-data: +``` + +Bring it up: + +```sh +docker compose up -d +docker compose logs -f restic-manager +``` + +The first run prints a one-time **bootstrap token** to the log. 
Use +it within an hour or it expires; if you miss the window the +container prints it again on the next start as long as no admin user +exists. + +## First-run admin setup + +Open `https://restic.example.com/bootstrap` (or whatever your +public URL is). Paste the bootstrap token, pick a username and a +password (≥ 12 characters), and submit. You'll land in the +dashboard logged in as the new admin. + +If you'd rather curl it, the equivalent is: + +```sh +curl -X POST https://restic.example.com/api/bootstrap \ + -H 'Content-Type: application/json' \ + -d '{"token":"","username":"admin","password":"<≥12 chars>"}' +``` + +## Backing up the secret key + +Inside the data volume, `secret.key` holds the AEAD key used to +encrypt every credential at rest. **Back it up separately from +the database.** Without it, encrypted credentials in the database +are unrecoverable; you'd have to re-enrol every host. + +A simple working approach: copy `secret.key` to your password +manager or to a separately-backed-up secrets vault the day you +install. It doesn't change. + +## Updating the server + +```sh +# Pin a new version in your compose file (.env or docker-compose.yml), +# then: +docker compose pull +docker compose up -d +``` + +Migrations run automatically on startup; the server will refuse to +start if a migration fails (better to bail than to half-migrate). + +For the agent self-update story, see +[Updating agents](../operations/updates.md). diff --git a/docs/book/src/getting-started/reverse-proxy.md b/docs/book/src/getting-started/reverse-proxy.md new file mode 100644 index 0000000..e0f55a4 --- /dev/null +++ b/docs/book/src/getting-started/reverse-proxy.md @@ -0,0 +1,95 @@ +# Running behind a reverse proxy + +The restic-manager server is HTTP-only by design. TLS termination, +public hostname, ACME, HSTS, and edge-level rate limiting all +belong to a reverse proxy you already operate outside this project. 
+ +## What the proxy must forward + +The server reads four headers when (and only when) the immediate +peer matches `RM_TRUSTED_PROXY`: + +| Header | Value | Why | +|------------------------|----------------------------------------------------|-----| +| `X-Forwarded-For` | The original client IP | Rate-limit keys, audit log entries, OIDC redirect-URI checks. | +| `X-Forwarded-Proto` | `https` | Used for absolute URLs (e.g. OIDC redirect URIs). | +| `Host` | The public hostname clients use | Cookies are scoped to this; `RM_BASE_URL` must match. | +| `Connection` / `Upgrade` | Pass through unchanged | `/ws/agent` and `/api/jobs/{id}/stream` are WebSockets; without `Upgrade: websocket` they fail. | + +Set `RM_TRUSTED_PROXY` to the CIDR (or comma-separated list of +CIDRs) the proxy connects from. Anything outside that range has +its `X-Forwarded-*` headers ignored, so a stray request that +bypasses the proxy can't spoof the client IP. + +## Caddy + +```caddyfile +restic.example.com { + encode zstd gzip + reverse_proxy 127.0.0.1:8080 { + header_up X-Real-IP {remote_host} + } +} +``` + +Caddy adds `X-Forwarded-For` / `X-Forwarded-Proto` automatically +and passes WebSocket headers through by default, so this is the +whole config. + +## nginx + +```nginx +server { + listen 443 ssl http2; + server_name restic.example.com; + + ssl_certificate /etc/letsencrypt/live/restic.example.com/fullchain.pem; + ssl_certificate_key /etc/letsencrypt/live/restic.example.com/privkey.pem; + + location / { + proxy_pass http://127.0.0.1:8080; + proxy_http_version 1.1; + proxy_set_header Host $host; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto https; + + # WebSocket upgrade + proxy_set_header Upgrade $http_upgrade; + proxy_set_header Connection "upgrade"; + + # Long-lived agent WS — disable read timeout for this surface. 
+ proxy_read_timeout 86400s; + } +} +``` + +## Traefik + +```yaml +http: + routers: + restic-manager: + rule: "Host(`restic.example.com`)" + entryPoints: [websecure] + tls: + certResolver: letsencrypt + service: restic-manager + + services: + restic-manager: + loadBalancer: + servers: + - url: "http://restic-manager:8080" + passHostHeader: true +``` + +Traefik forwards WebSocket upgrades and the standard +`X-Forwarded-*` set out of the box. + +## Verification + +After bringing the proxy up, the audit log should show your real +client IP for an interactive login (not the proxy's local +address). If you see `127.0.0.1` or the proxy's container IP, your +`RM_TRUSTED_PROXY` is wrong or `X-Forwarded-For` isn't being +forwarded. diff --git a/docs/book/src/intro.md b/docs/book/src/intro.md new file mode 100644 index 0000000..5f265a9 --- /dev/null +++ b/docs/book/src/intro.md @@ -0,0 +1,86 @@ +# restic-manager + +restic-manager is a self-hosted, browser-based, single-pane-of-glass +for managing [restic](https://restic.net) backups across a fleet of +Linux and Windows endpoints. It's designed for **small fleets** — +the original target was twelve endpoints — and **one operator**. + +## What it does + +- Centralised view of every endpoint's last backup, repo size, + snapshot count, and recent jobs. +- Trigger any restic operation remotely (`backup`, `forget`, `prune`, + `check`, `unlock`, `snapshots`, `stats`, `diff`, `restore`). +- Per-host backup schedules with source groups (named bundles of + paths + retention policy). +- Live job log streamed to the browser; downloadable as text or NDJSON. +- Restore wizard with snapshot tree browse + path selection. +- Repo-level health surfacing (size, raw size, last-check, lock + state) plus a 30/90-day size trend. +- Alerting over webhook, ntfy, or SMTP. +- Cross-platform agent (Linux + Windows). +- Append-only-credential-friendly with a separate admin credential + for forget/prune. 
+ +## What it isn't + +- **Not a SaaS.** Single-instance, single-tenant, by design. +- **Not a replacement for restic** — it's a control plane. The agent + shells out to a real `restic` binary. +- **Not highly available.** SQLite, single process; if you need + HA backups, you're shopping in the wrong aisle. +- **Not a multi-protocol backup tool.** restic only. + +## How it fits together + +``` +┌──────────────────────────────────────────────┐ +│ Server (control plane, Docker) │ +│ - REST + WebSocket API │ +│ - SQLite store │ +│ - Embedded HTMX UI │ +└──────────┬─────────────────────────┬─────────┘ + │ outbound WS │ HTTP(S) + │ │ +┌──────────▼──────────┐ ┌──────────▼─────────┐ +│ Agent (per host) │ │ Browser (operator) │ +│ - restic wrapper │ └─────────────────────┘ +│ - cron for sched. │ +└──────────┬──────────┘ + │ restic +┌──────────▼──────────────────────────────────┐ +│ rest-server / S3 / SFTP / local repo │ +│ (the actual backup data — server never │ +│ touches it) │ +└─────────────────────────────────────────────┘ +``` + +The control plane is a Go binary that runs in Docker. Each endpoint +runs a small Go agent that holds an outbound WebSocket to the +control plane. Backup data flows directly between the agent and the +restic repository — the control plane never sees a snapshot byte. + +## Where to start + +- [Installing the server](./getting-started/install.md) walks + through the Docker-based reference deployment. +- [Enrolling your first host](./getting-started/enrolling-hosts.md) + covers the install scripts and the announce-and-approve flow. +- [Architecture](./concepts/architecture.md) is the right read if + you want to know why something is the way it is before running + the install. + +## Project status + +Pre-1.0 but feature-complete for the original use case. Phases +0–4 are landed (MVP, scheduling, restore, RBAC + OIDC); Phase 5 +(this docs site, contributor onboarding, end-to-end CI) is in +flight. 
See [`tasks.md`](https://gitea.dcglab.co.uk/steve/restic-manager/src/branch/main/tasks.md) +for the live roadmap and [`spec.md`](https://gitea.dcglab.co.uk/steve/restic-manager/src/branch/main/spec.md) +for the canonical design doc. + +## License + +[PolyForm Noncommercial 1.0.0](https://polyformproject.org/licenses/noncommercial/1.0.0/). +Personal and community deployments welcome; commercial use +requires a separate license. diff --git a/docs/book/src/license.md b/docs/book/src/license.md new file mode 100644 index 0000000..a627198 --- /dev/null +++ b/docs/book/src/license.md @@ -0,0 +1,39 @@ +# License + +restic-manager is licensed under +[**PolyForm Noncommercial 1.0.0**](https://polyformproject.org/licenses/noncommercial/1.0.0/). +The full text lives at +[`LICENSE`](https://gitea.dcglab.co.uk/steve/restic-manager/src/branch/main/LICENSE) +in the repository root. + +## What this means + +- **Personal, hobbyist, educational, charitable, and similar + noncommercial use** is fully permitted, including modification + and redistribution. +- **Commercial use is not permitted** without a separate + license. The maintainer is not currently offering one — if + you need commercial rights, open an issue to start the + conversation. +- The license is permissive about everything except commercial + use: you can fork, modify, deploy in your home/lab, and + contribute back. + +## Why this license + +The PolyForm Noncommercial license was chosen because: + +- It's a real, legal, plainly-worded license (not a custom + half-written variant). +- It permits the realistic uses for a hobby project (the + maintainer's homelab, a friend's fleet, a charity's IT + closet) without inviting commercial vendors to repackage + the work. +- It's compatible with the project staying small and + maintainable — the maintainer doesn't want to be on the hook + for SLA-grade commercial support. 
+ +## Contributions + +By contributing, you agree your contributions are licensed +under the same PolyForm Noncommercial 1.0.0 license. diff --git a/docs/book/src/operations/alerts.md b/docs/book/src/operations/alerts.md new file mode 100644 index 0000000..cb73f8f --- /dev/null +++ b/docs/book/src/operations/alerts.md @@ -0,0 +1,73 @@ +# Alerts and notifications + +restic-manager raises alerts on conditions that need human +attention. The alert engine evaluates rules on a 60s tick and +on every job-finished / host-online event. + +## Built-in alert kinds + +| Kind | Trigger | Severity | +|---------------------|---------|----------| +| `backup_failed` | A backup job ends in `failed` or `cancelled` | warning | +| `forget_failed` | A forget job ends in `failed` | warning | +| `prune_failed` | A prune job ends in `failed` | critical | +| `check_failed` | A check job ends in `failed` | critical | +| `agent_offline` | A host has been offline more than 90s past its heartbeat cadence | warning | +| `stale_schedule` | A schedule's "last run" is more than 1.5 × its interval ago | warning | +| `update_failed` | An agent self-update returned a fail or didn't reconnect within 90s | warning | +| `fleet_update_halted`| The rolling fleet-update worker stopped on a failure | critical | + +Each alert has a `dedup_key` so re-firing the same condition +just bumps `last_seen_at` — the operator gets one row per +condition, not a thousand. + +## Lifecycle + +``` +raised ──acknowledge──▶ acknowledged ──resolve──▶ resolved + │ │ + └────────auto-resolve──────┘ + (e.g. agent_offline auto-resolves on agent_online) +``` + +- **Acknowledge** says "I've seen this, stop notifying about it". +- **Resolve** says "the underlying condition is gone". +- Some alerts auto-resolve when the condition clears + (`agent_offline` is the canonical example). + +## Notification channels + +Configure under **Settings → Notifications**. Each channel can +subscribe to all alerts or filter by severity. 
+ +### Webhook + +Posts a JSON envelope to a URL of your choice. Useful for +piping into Slack via an Incoming Webhook URL or into your own +alerting tooling. + +### ntfy + +Pushes a plain-text alert to an [ntfy.sh](https://ntfy.sh/) +topic. Configure the topic URL; optional bearer token if you +self-host with auth. + +### SMTP + +Plain SMTP (with optional TLS). Configure host, port, +username, password, and the recipient list. + +## Test fire + +Each channel exposes a **Test fire** button that dispatches a +single synthetic alert through the channel without touching the +alert engine. Use this when you've added a channel and want to +verify connectivity before the next real failure happens. + +## What gets logged + +Every alert raise / acknowledge / resolve writes an audit log +entry. The audit log UI at **Settings → Audit log** filters by +user, action, target, and time range — useful for the +post-incident "who clicked acknowledge on the prune-failure +alert" question. diff --git a/docs/book/src/operations/backups-and-restores.md b/docs/book/src/operations/backups-and-restores.md new file mode 100644 index 0000000..31c1a29 --- /dev/null +++ b/docs/book/src/operations/backups-and-restores.md @@ -0,0 +1,73 @@ +# Backups and restores + +## Running a backup + +Three ways to trigger one: + +1. **Scheduled** — the agent's local cron fires at the time set + on the schedule. +2. **Run-now** — operator clicks **Run now** on the host detail + right rail. Posts to `/hosts/{id}/run-backup` (defaults to all + source groups) or to a per-group form for finer control. +3. **API** — `POST /api/hosts/{id}/jobs` with the appropriate + payload. Same audit + dispatch path. + +In every case the server creates a `jobs` row, broadcasts a +`command.run` to the host, and lands the operator on the live +job log page (HTMX `HX-Redirect`). + +## Cancelling a job + +Any running job — backup, forget, prune, restore, anything — +exposes a **Cancel** button on its detail page. 
The server +broadcasts `command.cancel`, and the agent kills the running +restic subprocess via context cancel: SIGTERM first, SIGKILL +after a 5s grace (`cmd.Cancel` + `cmd.WaitDelay`). On Windows the +SIGTERM step is replaced with `os.Kill` because Windows can't +deliver SIGTERM. Result: a cancelled job typically lands as `cancelled` +within a couple of hundred milliseconds (after the 5s grace at worst). + +## Restore wizard + +Restoring a file or path goes through a four-step wizard at +`/hosts/{id}/restore`: + +1. **Pick a snapshot.** Search by id or by date; the page is + pre-populated when you launched the wizard from a snapshot row. +2. **Browse the snapshot tree.** Lazy-loaded children via the + `MsgTreeList` synchronous WS RPC; results are cached + per-wizard-session for 30 minutes. Pick the absolute paths + you want. +3. **Choose a target.** Either **In place** (overwrites the + live filesystem; requires you to type the hostname to + confirm) or **New directory** (default + `$HOME/rm-restore/<subdir>/`; agent expands `$HOME` / + `${HOME}` / `~/` and creates the directory chain). +4. **Review and submit.** Server mints a job, dispatches + `command.run` with a `RestorePayload`, and `HX-Redirect`s to + the live job log. + +`--no-ownership` is gated on restic ≥ 0.17 (the flag was added +in that release). Hosts running 0.16 don't get the flag and +restore as the running user instead. + +## Snapshot diff + +Two snapshot ids in the **Diff** form on the host detail page → +a `JobDiff` job that runs `restic diff <snapshot-a> <snapshot-b>`. Output streams +to the standard live job log. Useful when investigating a +suspiciously-sized backup. + +## Job log artefacts + +Every job's log is persisted in `job_logs` (one row per line), +not just streamed in-memory. That gives you: + +- A live view at `/jobs/{id}` while the job runs. +- Two download formats from the same page header dropdown: + - **txt** — one line per row, `HH:MM:SS.mmm TAG payload`.
+ - **ndjson** — one self-contained JSON object per line + (`{seq, ts, stream, payload}`), perfect for `jq`. + +Downloads work whether the job is running or finished — +the source is the DB, not the live socket. diff --git a/docs/book/src/operations/observability.md b/docs/book/src/operations/observability.md new file mode 100644 index 0000000..f660d06 --- /dev/null +++ b/docs/book/src/operations/observability.md @@ -0,0 +1,61 @@ +# Observability with Prometheus + +restic-manager can expose a Prometheus scrape endpoint at +`GET /metrics`. The endpoint is **opt-in** — without an explicit +auth gate it isn't even mounted, so a forgotten config can't +accidentally publish fleet state. + +The full reference lives at +[`docs/prometheus.md`](https://gitea.dcglab.co.uk/steve/restic-manager/src/branch/main/docs/prometheus.md); +the short version follows. + +## Enable the endpoint + +Set at least one of: + +- `RM_METRICS_TOKEN` — `Authorization: Bearer <token>` required. +- `RM_METRICS_TRUSTED_CIDR` — restricts source IPs (comma-separated CIDR list). + +Both are ANDed when both are set. Constant-time token compare; CIDR +honours `X-Forwarded-For` only when the immediate hop matches +`RM_TRUSTED_PROXY`. + +## Metrics emitted + +- **Server gauges**: `rm_hosts_total`, `rm_hosts_online`, + `rm_active_alerts{severity}`, `rm_build_info{...}`. +- **Per-host gauges**: `rm_host_agent_online`, + `rm_host_last_backup_timestamp_seconds`, + `rm_host_last_backup_success`, `rm_host_repo_size_bytes`, + `rm_host_snapshot_count`, `rm_host_open_alerts`, + `rm_host_repo_status`. +- **Histogram**: + `rm_job_duration_seconds{kind,status,le=…}` (buckets + `1, 5, 30, 60, 300, 1800, 3600, 21600, 86400, +Inf`). + +In-memory histogram only. Prometheus persists the scrapes; if +you need durable history at hourly resolution that's +Prometheus's job.
+ +## Sample Grafana dashboard + +[`deploy/grafana/restic-manager-dashboard.json`](https://gitea.dcglab.co.uk/steve/restic-manager/src/branch/main/deploy/grafana/restic-manager-dashboard.json) +imports through Grafana's **+ → Import → Upload JSON file**. +Six panels: + +1. Fleet status (online / total). +2. Open alerts by severity. +3. Backups failing on most-recent run. +4. Hosts table — last backup, repo size, snapshots, open alerts. +5. Repo size over time, one line per host. +6. Job-duration p95 over a 1h window per kind. + +## Alerting + +restic-manager already has a built-in alert engine +([Alerts](./alerts.md)). The dashboard intentionally doesn't +duplicate it as Prometheus alert rules. If you want +Prometheus-side alerts on top, write your own based on the +metrics above — `rm_host_last_backup_success == 0`, +`time() - rm_host_last_backup_timestamp_seconds > <threshold>`, +or whatever suits your environment. diff --git a/docs/book/src/operations/updates.md b/docs/book/src/operations/updates.md new file mode 100644 index 0000000..3b571be --- /dev/null +++ b/docs/book/src/operations/updates.md @@ -0,0 +1,50 @@ +# Updating agents + +Server updates are a `docker compose pull && docker compose up -d` away. +Agents update via the control plane. + +## Single-host update + +Each host's detail page shows an **Update agent** button when +the agent's reported version is older than the server's. The +button: + +1. Dispatches a `command.update` to that host. +2. The agent fetches the appropriate binary from + `$RM_SERVER/agent/binary?os=…&arch=…` to + `<binary>.new`. +3. Copies the running binary to `<binary>.old` (one + revision back, in case rollback is needed). +4. Atomic-renames `<binary>.new` over the running binary. +5. Exits cleanly. systemd's `Restart=always` (or Windows SCM) + brings the process back on the new binary.
+ +A 90-second timer on the server side waits for a hello at the +target version and marks the update succeeded — or, if the +agent doesn't reconnect at the expected version in time, marks +the update **failed** and raises an `update_failed` alert. + +## Fleet update + +The admin-only **Settings → Fleet update** page drives a rolling +update across every host in the fleet: + +- One host at a time. +- Wait for hello-with-target-version (max 95s). +- On any host failing, **halt** the rollout, raise a + `fleet_update_halted` alert, leave the rest of the fleet on + the old version. No surprise mass-failures. + +You can cancel an in-progress fleet update; the worker stops +after the current host finishes. + +## TLS and corruption + +Updates rely on the reverse proxy's TLS to detect corruption in +transit. There's no separate sha256 verification step — we +chose the simpler model on the basis that the same TLS already +gates every other byte the server hands to the agent. + +If you'd like a separate signature step before applying updates, +that's a future-phase enhancement (see `tasks.md` Phase 6 +candidates). diff --git a/docs/book/src/reference/env-vars.md b/docs/book/src/reference/env-vars.md new file mode 100644 index 0000000..e193c95 --- /dev/null +++ b/docs/book/src/reference/env-vars.md @@ -0,0 +1,58 @@ +# Environment variables + +The server reads its configuration from environment variables +(canonical) with an optional YAML overlay. Env wins over YAML so +operators can tweak a single setting without rewriting the file. + +## Server + +| Variable | Default | Meaning | +|---------------------------|----------------------------------|---------| +| `RM_LISTEN` | `:8080` | TCP listener for the HTTP server. | +| `RM_DATA_DIR` | `/data` | Persistent state directory (SQLite, secret key, agent assets). | +| `RM_BASE_URL` | (none) | Public URL clients use; required for OIDC redirects + cookie scope. 
| +| `RM_SECRET_KEY_FILE` | `${RM_DATA_DIR}/secret.key` | Path to the AEAD key file. Auto-generated on first run. | +| `RM_COOKIE_SECURE` | `true` | Set `false` only for local HTTP testing. Controls `Secure` on session cookies. | +| `RM_TRUSTED_PROXY` | (none) | Comma-separated CIDRs trusted for `X-Forwarded-*`. | +| `RM_BUNDLED_ASSETS_DIR` | `/opt/restic-manager/dist` | Read-only path with bundled agent binaries + install scripts (the docker image bakes them here). | +| `RM_METRICS_TOKEN` | (off) | When set, `GET /metrics` requires `Authorization: Bearer <token>`. | +| `RM_METRICS_TRUSTED_CIDR` | (off) | When set, `GET /metrics` restricts source IPs (comma-separated CIDR list). | + +OIDC variables (all optional; empty issuer disables OIDC): + +| Variable | Meaning | +|--------------------------------|---------| +| `RM_OIDC_ISSUER` | OIDC discovery URL (e.g. `https://auth.example.com`). | +| `RM_OIDC_CLIENT_ID` | Client ID registered with the IdP. | +| `RM_OIDC_CLIENT_SECRET` | Client secret (or use `RM_OIDC_CLIENT_SECRET_FILE`). | +| `RM_OIDC_CLIENT_SECRET_FILE` | Path to a file holding the client secret. | +| `RM_OIDC_DISPLAY_NAME` | Button label on the login page (e.g. "Authelia"). | +| `RM_OIDC_ROLE_CLAIM` | Token claim that carries roles (default `groups`). | +| `RM_OIDC_ROLE_MAPPING` | `idp-group=role` entries, comma-separated (e.g. `rm-admin=admin,rm-ops=operator`). | +| `RM_OIDC_REDIRECT_URL` | Override for the redirect URL; defaults to `${RM_BASE_URL}/auth/oidc/callback`. | + +## Agent + +| Variable | Default | Meaning | +|----------------------|---------|---------| +| `RM_AGENT_CONFIG` | `/etc/restic-manager/agent.yaml` (Linux) | Config file path. | + +The agent's other settings live in the YAML file (server URL, +bearer token, optional cert pin). The install script writes that +file for you at enrolment.
+ +## Build-time + +The Makefile threads `-ldflags` from `git describe` into the +`internal/version` package so `--version` and the dashboard +footer show the right values: + +``` +-X gitea.dcglab.co.uk/steve/restic-manager/internal/version.Version=$(VERSION) +-X gitea.dcglab.co.uk/steve/restic-manager/internal/version.Commit=$(COMMIT) +``` + +If you build with `go build` directly (no Makefile), `Version` +falls back to `dev` and the agent-update comparison falls back +to "always equal". Source-build deployments can still run; they +just don't participate in the self-update flow. diff --git a/docs/book/src/reference/http-endpoints.md b/docs/book/src/reference/http-endpoints.md new file mode 100644 index 0000000..9866066 --- /dev/null +++ b/docs/book/src/reference/http-endpoints.md @@ -0,0 +1,82 @@ +# HTTP endpoints + +A non-exhaustive map of the surfaces the control plane exposes. +All `/api/*` routes return JSON; all other paths render HTML +(server-rendered with HTMX in the loop). + +The canonical wiring lives at +[`internal/server/http/server.go`](https://gitea.dcglab.co.uk/steve/restic-manager/src/branch/main/internal/server/http/server.go); +when in doubt, read the routes block there. + +## Public (no auth) + +| Method | Path | Purpose | +|--------|----------------------------|---------| +| GET | `/healthz` | Liveness probe. Returns 204. | +| POST | `/api/auth/login` | Local-user login. JSON body: `{username, password}`. | +| POST | `/api/auth/logout` | Invalidate the session cookie. | +| POST | `/api/bootstrap` | First-run admin creation. Accepts the token printed at first start. | +| POST | `/api/agents/enroll` | Token-based agent enrolment. | +| POST | `/api/agents/announce` | Announce-and-approve agent enrolment. | +| GET | `/agent/binary?os=&arch=` | Serves the agent binary for the install scripts. | +| GET | `/install/*` | Serves the Linux + Windows install scripts and the systemd unit. | +| GET | `/api/version` | Build version + commit JSON. 
| +| GET | `/metrics` | Prometheus exposition (only when opted-in via `RM_METRICS_TOKEN` / `RM_METRICS_TRUSTED_CIDR`). | +| GET | `/login`, `/setup`, `/bootstrap` | UI pages. | + +## Authenticated (any role) + +| Method | Path | Purpose | +|--------|------------------------------------------|---------| +| GET | `/` | Dashboard. | +| GET | `/hosts/{id}` | Host detail. | +| GET | `/hosts/{id}/repo` | Repo tab. | +| GET | `/hosts/{id}/jobs` | Jobs tab. | +| GET | `/hosts/{id}/sources` | Source groups list. | +| GET | `/hosts/{id}/schedules` | Schedules list. | +| GET | `/jobs/{id}` | Live job log. | +| GET | `/api/hosts`, `/api/fleet/summary` | JSON list + summary. | +| GET | `/api/jobs/{id}/stream` | WebSocket subscription to a job's live log. | +| GET | `/api/jobs/{id}/log.{txt,ndjson}` | Persisted log download. | + +## Operator role and above + +| Method | Path | Purpose | +|--------|---------------------------------------|---------| +| POST | `/hosts/{id}/run-backup` | Run-now (HTMX form-post). | +| POST | `/hosts/{id}/sources/{gid}/run-now` | Per-source-group run-now. | +| POST | `/hosts/{id}/repo/{prune,check,unlock,reinit,probe}` | Maintenance actions. | +| POST | `/api/hosts/{id}/snapshots/diff` | Snapshot-diff job. | +| POST | `/hosts/{id}/restore` | Restore wizard submit. | +| POST | `/api/jobs/{id}/cancel` | Cancel a running job. | +| POST | `/hosts/{id}/tags` | Update host tags. | +| POST | `/hosts/{id}/sources` and friends | Source-group CRUD. | +| POST | `/hosts/{id}/schedules` and friends | Schedule CRUD. | +| POST | `/hosts/{id}/repo/credentials`, `/admin-credentials` | Credential update. | + +## Admin role only + +| Method | Path | Purpose | +|--------|---------------------------------------|---------| +| POST | `/hosts/new` | Mint enrolment token (Add host). | +| POST | `/hosts/{id}/delete` | Delete + cascade. | +| POST | `/hosts/{id}/update` | Dispatch a single agent update. | +| GET/POST | `/settings/users/...` | User management. 
| +| POST | `/settings/notifications/...` | Notification channel CRUD + test fire. | +| POST | `/settings/fleet-update/...` | Fleet-update worker. | + +## WebSocket + +| Path | Who connects | Auth | +|--------------------------------|--------------|------| +| `/ws/agent` | Agent | Bearer token issued at enrolment. | +| `/ws/agent/pending` | Agent (announce flow) | Pending-id query param. | +| `/api/jobs/{id}/stream` | Browser | Session cookie. | + +## RBAC enforcement + +Routes are grouped into chi route-groups by required role +(`viewer < operator < admin`); the `requireRole` middleware in +`internal/server/http/middleware.go` is the bouncer. Sessions +re-validate `disabled_at` on every request, so a disabled user's +cookie stops working immediately. diff --git a/docs/book/src/roadmap.md b/docs/book/src/roadmap.md new file mode 100644 index 0000000..c6fdb24 --- /dev/null +++ b/docs/book/src/roadmap.md @@ -0,0 +1,32 @@ +# Roadmap + +The live roadmap is in +[`tasks.md`](https://gitea.dcglab.co.uk/steve/restic-manager/src/branch/main/tasks.md). +Phases ship in order; items inside a phase ship as the +opportunity arises. 
+ +## Status snapshot + +| Phase | Theme | Status | +|-------|--------------------------------------------------|--------| +| 0 | Project bootstrap | ✅ done | +| 1 | MVP: enrolment, visibility, on-demand backup | ✅ done | +| 2 | Scheduling, retention, repo operations | ✅ done | +| 3 | Restore, alerts, audit | ✅ done | +| 4 | RBAC, OIDC, host tags | ✅ done | +| 5 | OSS readiness | 🚧 in flight (this docs site is part of it) | +| 6 | Update delivery + observability polish | ✅ done | + +## What's not on the roadmap + +The non-goals list in [`spec.md` §2](https://gitea.dcglab.co.uk/steve/restic-manager/src/branch/main/spec.md): + +- Replacing restic itself or providing custom repo formats +- Managing non-restic backup tools +- Multi-tenancy / SaaS deployment +- High availability of the control plane (SQLite, single-instance) +- Mobile-native apps (responsive web only) + +If something there is critical to your use case, restic-manager +isn't the right tool. That's not a closed door — it's a +deliberate scope decision so the project stays maintainable. diff --git a/docs/book/src/security/disclosure.md b/docs/book/src/security/disclosure.md new file mode 100644 index 0000000..d03a04f --- /dev/null +++ b/docs/book/src/security/disclosure.md @@ -0,0 +1,35 @@ +# Reporting vulnerabilities + +The full disclosure policy lives in +[`SECURITY.md`](https://gitea.dcglab.co.uk/steve/restic-manager/src/branch/main/SECURITY.md) +at the repo root. The short version: + +- **Don't open a public issue.** +- Send a Gitea private message to `steve` on + <https://gitea.dcglab.co.uk>, or email the address on the + maintainer's profile, with a subject like + `[SECURITY] restic-manager: <short summary>`. +- Expect an acknowledgement within 3 working days; escalate + through the other channel if you don't get one. +- Default disclosure window is **30 days from confirmed report + to public disclosure**, faster if a PoC is already + circulating, slower only by mutual agreement.
+ +## What to include + +A description of the issue and the impact, the affected +component (server / agent / install script / docs), the version, +and reproduction steps. A working PoC is welcome but not +required — a credible threat model is enough. + +## In scope vs. out of scope + +See the full policy. Quick highlights: + +- **In scope:** server, agent, install scripts, docker image, + docker-compose reference, crypto choices, docs that lead to + insecure configs. +- **Out of scope:** restic itself (report upstream), unpatched + third-party deps (report upstream first), pre-authenticated + admin abuse (admins are designed to have full power), DoS on + deployments without the recommended reverse proxy. diff --git a/docs/book/src/security/hardening.md b/docs/book/src/security/hardening.md new file mode 100644 index 0000000..52e3efc --- /dev/null +++ b/docs/book/src/security/hardening.md @@ -0,0 +1,72 @@ +# Hardening checklist + +A baseline for new deployments. Most of these are defaults; the +list is here to make audit easy. + +## Server + +- [ ] Reverse proxy in front, TLS terminating at the proxy + (Caddy/nginx/Traefik). +- [ ] `RM_TRUSTED_PROXY` set to the proxy's CIDR. +- [ ] `RM_BASE_URL` matches the public hostname and the cookie + scope you want. +- [ ] `RM_COOKIE_SECURE=true` (the default; only set `false` + for local HTTP testing). +- [ ] HTTP listener bound to **localhost** in the compose file, + not `0.0.0.0`. The reverse proxy is the only thing that + should reach it. +- [ ] `secret.key` backed up separately from the database. +- [ ] Bootstrap token consumed and the printed log line scrubbed + from any log archive. + +## Authentication + +- [ ] Admin user has a password ≥ 12 characters (the floor). +- [ ] OIDC enabled if you have an IdP — local password auth + stays as a break-glass. +- [ ] Disabled (not deleted) any users who change roles or leave + so their session is invalidated immediately. 
+- [ ] The last-admin guard isn't tripped — there's always at + least one enabled admin user. + +## Repo credentials + +- [ ] Append-only credential set as the everyday cred for every + host. +- [ ] Admin credential set only where prune cadence is enabled. +- [ ] No credentials reused across hosts. Each host should have + its own credential pair so a single host compromise has a + single blast radius. +- [ ] If using rest-server, `--append-only` flag is on for the + everyday user; the prune user is a separate identity. + +## Agent + +- [ ] Agent runs as `root` (Linux) or `LocalSystem` (Windows) + **only when** the source paths require it. Otherwise pin + a service user that has read access to what's backed up + and nothing else. +- [ ] systemd unit's sandboxing flags are intact + (`NoNewPrivileges`, `Protect*`, `MemoryDenyWriteExecute`). +- [ ] Agent's config file `/etc/restic-manager/agent.yaml` is + mode `0600` and owned by the service user. The bearer + token lives in there. + +## Operations + +- [ ] Alerts wired to a real channel (webhook into Slack, + ntfy topic, SMTP) — not just sitting in the UI. +- [ ] Test-fire each notification channel after configuring. +- [ ] Audit-log retention is long enough to cover the operator's + incident-response window. +- [ ] Prometheus endpoint, if enabled, gated by token AND CIDR + where practical (default is opt-in / off). + +## Recovery + +- [ ] A documented procedure for rotating a leaked agent bearer + (delete + re-enrol the host). +- [ ] A test-restore done at least once, end-to-end, before + relying on the system in anger. +- [ ] `secret.key` and the SQLite database covered by separate + backup paths so neither alone reconstitutes the other. 
diff --git a/docs/book/src/security/threat-model.md b/docs/book/src/security/threat-model.md new file mode 100644 index 0000000..8af091e --- /dev/null +++ b/docs/book/src/security/threat-model.md @@ -0,0 +1,110 @@ +# Threat model + +This page documents what restic-manager defends against, what it +doesn't, and the trust assumptions a deployment is making. The +canonical version lives in [`spec.md`](https://gitea.dcglab.co.uk/steve/restic-manager/src/branch/main/spec.md) +§11; the summary here is shaped for operators rather than +implementers. + +## Trust boundaries + +``` +┌──────────────────────────────────────────┐ +│ TRUSTED zone │ +│ ┌─────────────┐ ┌──────────────┐ │ +│ │ Operator's │ │ Reverse │ │ +│ │ browser │◄──►│ proxy │ │ TLS terminates here +│ └─────────────┘ └──────┬───────┘ │ +└────────────────────────────┼─────────────┘ + │ HTTP, plaintext + │ (loopback or trusted LAN) +┌────────────────────────────▼─────────────┐ +│ Server (control plane) │ +└────────────┬─────────────────────────────┘ + │ outbound WebSocket (TLS to clients via proxy) + │ — bearer-authenticated +┌────────────▼──────────────┐ +│ Agent (per host) │ ◄── attacker model: assume one +└────────────┬──────────────┘ endpoint can be compromised + │ subprocess + ▼ + restic ──▶ repository (rest-server / S3 / SFTP / …) +``` + +## What we defend against + +### Network attacker between operator and server + +- HTTPS via the reverse proxy is the only operator-facing surface + on a sane deployment. +- `RM_COOKIE_SECURE=true` (default) means the session cookie + refuses to ride a non-HTTPS connection. +- `RM_TRUSTED_PROXY` gates whether `X-Forwarded-*` is honoured; + a bypassing request can't spoof the client IP. + +### Compromised agent host + +- The agent's bearer token can dispatch commands **only on its + own host**. It can't read other hosts' state, dispatch jobs + on other hosts, or escalate within the control plane. +- If you suspect a host compromise: + 1. 
Delete the agent's host row via **Hosts → Delete** + (cascades the bearer hash). + 2. Rotate the repo credential at the rest-server / object + store side. + 3. Review the audit log, which lists every action that bearer ever drove. + +### DB compromise without the secret key + +- Repo credentials are AEAD-encrypted at rest. A DB dump alone + doesn't expose them. +- Agent bearer **hashes** are leaked; that's enough to + authenticate as any agent until you revoke. A rotation + procedure is just "delete + re-enrol" today. +- Operator passwords are bcrypt-hashed; OIDC users have no + password to leak. +- Session tokens are hashed; an attacker can't replay a + session from a DB dump. + +### DB compromise WITH the secret key + +The attacker can decrypt every credential. Treat +`secret.key` with the same care as a password manager database. +Back it up to a separate vault, not to the same Docker volume +as the database. + +### Forget/prune as a DoS vector + +- The everyday backup credential cannot prune (append-only). +- The admin credential is only pushed to the agent at the + moment of dispatch and discarded after the job ends. +- Compromise of a single agent host does **not** grant prune + rights — at worst the attacker gets fresh write access until + the credential is rotated. + +### Operator-side typo or bad copy-paste + +- Repo credentials are stored encrypted; mis-typed creds fail + fast on the next `restic` invocation rather than silently + corrupting state. +- NS-03 added auto-init: the first dispatched job after creds + change runs `restic init`, surfaces the error eagerly under + the host's vitals strip if the creds are bad, and resets the + host's `repo_status` so the operator can retry without + hunting through job logs. + +## What we don't defend against + +- **Insider threat at the maintainer level.** A malicious + maintainer can publish a backdoored container; SBOM / + signing infrastructure (Phase 6 candidate) would help here + but isn't shipped today.
+- **Supply chain.** We pin module versions (`go.sum`) and + pin the Tailwind binary's release tag, but a compromise in + one of those upstreams would land here. +- **Side-channel via restic itself.** A bug in restic that + enables snapshot-content disclosure is restic's problem; the + control plane doesn't see snapshot bytes either way. +- **DoS via resource exhaustion** without the recommended + reverse-proxy / rate-limit in front. Don't expose the + server's HTTP port to the public internet directly. diff --git a/docs/e2e.md b/docs/e2e.md new file mode 100644 index 0000000..7d66739 --- /dev/null +++ b/docs/e2e.md @@ -0,0 +1,120 @@ +# End-to-end test harness + +The e2e harness stands up the full production-shaped stack +(server + agent + rest-server) in Docker Compose and drives it +through Playwright. CI runs it on every PR; operators can run it +locally too. + +## Files + +``` +e2e/ +├── compose.e2e.yml compose stack: server + rest-server + agent +├── Dockerfile.agent Linux container for the agent (alpine + restic) +├── agent-entrypoint.sh decides between announce / token-enrol / run +└── playwright/ + ├── package.json + ├── playwright.config.ts + └── tests/ + ├── lib/server.ts bootstrap, login, accept, poll helpers + └── smoke.spec.ts happy-path: enrol → backup → succeeded +``` + +## Local run + +Prerequisites: Docker + Docker Compose, and `npx` for Playwright. + +```sh +# 1. Build + bring up the stack (server, rest-server, source data). +docker compose -f e2e/compose.e2e.yml up --build -d server rest-server source-fixture + +# 2. Wait for the server, then scrape the bootstrap token from the log. +until curl -fsS http://127.0.0.1:8080/api/version >/dev/null; do sleep 1; done +RM_BOOTSTRAP_TOKEN=$(docker compose -f e2e/compose.e2e.yml logs server \ + | grep -Eo '[a-zA-Z0-9_-]{40,}' | head -1) +export RM_BOOTSTRAP_TOKEN + +# 3. Start the agent (it announces against the running server). +docker compose -f e2e/compose.e2e.yml up -d agent + +# 4. 
Install + run Playwright. +cd e2e/playwright +npm install +npx playwright install --with-deps chromium +npx playwright test +``` + +When the test passes you'll see: + +``` +Running 2 tests using 1 worker + ✓ smoke: enrol-via-announce → backup › happy path completes in under a minute (47s) + ✓ smoke: scrape /metrics › metrics endpoint exposes the host gauge (180ms) + + 2 passed (47.5s) +``` + +Tear-down: + +```sh +docker compose -f e2e/compose.e2e.yml down -v +``` + +`-v` removes the named volumes too — important between runs because +the rest-server volume holds an initialised repo and the +agent-config volume holds a stale bearer. + +## What the test exercises + +1. **Bootstrap.** Posts the admin-creation request to + `/api/bootstrap` with the token scraped from the server log. +2. **Login (UI).** Drives the login form via Playwright; verifies + the dashboard loads with a session cookie set. +3. **Pending host appears.** Polls the dashboard for the inline + accept form generated by the announcing agent; reads the + pending-id out of its action URL. +4. **Accept.** POSTs `/api/pending-hosts/{id}/accept` with the + rest-server URL + repo password. The server mints a Host row + + bearer + AEAD-encrypted creds and pushes the bearer down + the still-open pending WebSocket. +5. **Online + auto-init.** Polls `/api/hosts` until the new host + is `status=online`. Auto-init runs as part of this — the + first dispatched job after creds save is `restic init`. +6. **Run backup.** Submits the host detail page's `Run now` + form; expects `HX-Redirect` to the live job page. +7. **Verify.** Polls `/api/hosts` until the host's + `last_backup_status` flips to `succeeded`. +8. **Metrics.** Scrapes `/metrics` and asserts the + server-gauge + build-info lines are present (the compose + stack opens the endpoint via `RM_METRICS_TRUSTED_CIDR=0.0.0.0/0`). + +## CI workflow + +[`.gitea/workflows/e2e.yml`](../.gitea/workflows/e2e.yml) runs the +suite on every PR into `main`. 
On failure it dumps the last 200 +lines of each container log as a workflow annotation and uploads +the Playwright HTML report as an artefact. + +## When tests fail + +- **Pending host never appears.** Agent container probably + couldn't reach the server. Check `docker compose -f e2e/compose.e2e.yml logs agent` + for connection errors and `docker compose -f e2e/compose.e2e.yml logs server` for + any 4xx on `/api/agents/announce`. +- **Backup hangs in `running`.** The agent shells out to + `restic`; check the live job log at + `http://127.0.0.1:8080/jobs/<id>` (still up after a + failed test as long as you didn't `down -v`). +- **`RM_BOOTSTRAP_TOKEN not set`.** The server log scrape + matched the wrong line or the token regex is too tight. The + server prints the token on a line starting with ` ` (four + spaces) inside a banner; widen the regex if your server log + format changes. + +## Adding new tests + +The harness is intentionally flat — one `*.spec.ts` per +scenario. Reuse the helpers in `lib/server.ts` and avoid +duplicating bootstrap / login boilerplate. Heavy fixtures +(custom users, OIDC IdP) belong in their own compose override +file rather than complicating `compose.e2e.yml`.
diff --git a/docs/screenshots/01-login.png b/docs/screenshots/01-login.png new file mode 100644 index 0000000..30cffbf Binary files /dev/null and b/docs/screenshots/01-login.png differ diff --git a/docs/screenshots/02-dashboard-empty.png b/docs/screenshots/02-dashboard-empty.png new file mode 100644 index 0000000..828a206 Binary files /dev/null and b/docs/screenshots/02-dashboard-empty.png differ diff --git a/docs/screenshots/03-add-host.png b/docs/screenshots/03-add-host.png new file mode 100644 index 0000000..671d6fc Binary files /dev/null and b/docs/screenshots/03-add-host.png differ diff --git a/docs/screenshots/04-alerts.png b/docs/screenshots/04-alerts.png new file mode 100644 index 0000000..e413351 Binary files /dev/null and b/docs/screenshots/04-alerts.png differ diff --git a/docs/screenshots/05-settings.png b/docs/screenshots/05-settings.png new file mode 100644 index 0000000..6a1244a Binary files /dev/null and b/docs/screenshots/05-settings.png differ diff --git a/docs/screenshots/06-audit.png b/docs/screenshots/06-audit.png new file mode 100644 index 0000000..e6333f5 Binary files /dev/null and b/docs/screenshots/06-audit.png differ diff --git a/e2e/Dockerfile.agent b/e2e/Dockerfile.agent new file mode 100644 index 0000000..e699170 --- /dev/null +++ b/e2e/Dockerfile.agent @@ -0,0 +1,42 @@ +# Build a Linux container that runs the restic-manager agent against a +# sibling rest-server in the e2e compose stack. Used only by tests +# (e2e/compose.e2e.yml + .gitea/workflows/e2e.yml). +# +# Two stages: +# 1. golang:alpine to build the agent binary. +# 2. alpine:3.20 with the `restic` package + the built binary. +# +# Base images are pinned by tag, not by digest; pin by digest if CI needs strict reproducibility. + +FROM golang:1.25-alpine AS build +WORKDIR /src + +ENV CGO_ENABLED=0 \ + GOFLAGS="-trimpath" + +COPY go.mod go.sum* ./ +RUN go mod download + +COPY . . 
+ARG VERSION=e2e +RUN go build -ldflags="-s -w -X gitea.dcglab.co.uk/steve/restic-manager/internal/version.Version=${VERSION}" \ + -o /out/restic-manager-agent ./cmd/agent + +FROM alpine:3.20 +RUN apk add --no-cache restic ca-certificates curl +COPY --from=build /out/restic-manager-agent /usr/local/bin/restic-manager-agent + +# Agents normally run as root because backup paths often need it. The +# e2e fixture only backs up paths under /data which we own, so this +# container would tolerate a non-root user — but staying root keeps +# parity with the production install. +USER root + +# The agent needs a writable directory for its config + secrets store. +RUN mkdir -p /etc/restic-manager /var/lib/restic-manager-agent +ENV RM_AGENT_CONFIG=/etc/restic-manager/agent.yaml + +# The compose entrypoint sets the announce URL via env. +COPY e2e/agent-entrypoint.sh /usr/local/bin/entrypoint.sh +RUN chmod +x /usr/local/bin/entrypoint.sh +ENTRYPOINT ["/usr/local/bin/entrypoint.sh"] diff --git a/e2e/agent-entrypoint.sh b/e2e/agent-entrypoint.sh new file mode 100755 index 0000000..7900a88 --- /dev/null +++ b/e2e/agent-entrypoint.sh @@ -0,0 +1,27 @@ +#!/bin/sh +# Entrypoint for the e2e agent container. +# +# Three states: +# 1. Already enrolled (agent.yaml has a bearer): run the agent. +# 2. Token supplied via $RM_ENROL_TOKEN: enrol then run. +# 3. Otherwise: announce against $RM_SERVER and wait for an admin to +# accept us. The announce flow blocks until accepted, then drops +# straight into the normal run loop, so this is the test-friendly +# path. +set -eu + +CFG="${RM_AGENT_CONFIG:-/etc/restic-manager/agent.yaml}" +SERVER="${RM_SERVER:?set RM_SERVER}" + +if [ -f "$CFG" ] && grep -q '^agent_token:' "$CFG"; then + exec restic-manager-agent -config "$CFG" +fi + +if [ -n "${RM_ENROL_TOKEN:-}" ]; then + exec restic-manager-agent -config "$CFG" \ + -enroll-server "$SERVER" \ + -enroll-token "$RM_ENROL_TOKEN" +fi + +# Announce-and-approve: blocks until an admin accepts, then runs. 
+exec restic-manager-agent -config "$CFG" -enroll-server "$SERVER" diff --git a/e2e/compose.e2e.yml b/e2e/compose.e2e.yml new file mode 100644 index 0000000..bb77b8d --- /dev/null +++ b/e2e/compose.e2e.yml @@ -0,0 +1,87 @@ +# End-to-end test stack — used by .gitea/workflows/e2e.yml and by +# operators who want to run the Playwright suite locally. +# +# Three long-running services (plus the one-shot source-fixture init container): +# * server — restic-manager built from the working tree +# * agent — restic-manager agent built from the working tree +# (announces; Playwright accepts it during the test) +# * rest-server — the actual restic backend, sibling of the agent +# +# Run from the repo root (detached: --abort-on-container-exit would stop the stack as soon as source-fixture exits): +# docker compose -f e2e/compose.e2e.yml up --build -d + +services: + rest-server: + image: restic/rest-server:0.13.0 + environment: + DATA_DIR: /data + OPTIONS: "--no-auth" + volumes: + - rest-data:/data + networks: [rmnet] + + server: + build: + context: .. + dockerfile: deploy/Dockerfile.server + args: + VERSION: e2e + environment: + RM_LISTEN: ":8080" + RM_DATA_DIR: "/data" + RM_BASE_URL: "http://server:8080" + RM_COOKIE_SECURE: "false" + # Bind the metrics endpoint loose for the test, so one of the + # Playwright assertions can exercise it. + RM_METRICS_TRUSTED_CIDR: "0.0.0.0/0" + volumes: + - server-data:/data + ports: + - "127.0.0.1:8080:8080" + healthcheck: + test: ["CMD", "/usr/local/bin/restic-manager-server", "--version"] + interval: 2s + timeout: 2s + retries: 30 + networks: [rmnet] + + agent: + build: + context: .. + dockerfile: e2e/Dockerfile.agent + args: + VERSION: e2e + environment: + RM_SERVER: "http://server:8080" + depends_on: + - server + volumes: + # Source paths the agent backs up. Compose pre-populates this + # with a few files so the snapshot list isn't empty. 
+ - source-data:/source + - agent-config:/etc/restic-manager + - agent-state:/var/lib/restic-manager-agent + networks: [rmnet] + + # One-shot init container that drops a couple of files into the + # source volume so backups have something to snapshot. + source-fixture: + image: alpine:3.20 + command: > + sh -c 'mkdir -p /source && echo "hello world" > /source/hello.txt && + echo "another file" > /source/two.txt && sleep 0.2' + volumes: + - source-data:/source + networks: [rmnet] + restart: "no" + +volumes: + server-data: + rest-data: + source-data: + agent-config: + agent-state: + +networks: + rmnet: + driver: bridge diff --git a/e2e/playwright/package.json b/e2e/playwright/package.json new file mode 100644 index 0000000..ed7afc3 --- /dev/null +++ b/e2e/playwright/package.json @@ -0,0 +1,14 @@ +{ + "name": "restic-manager-e2e", + "version": "0.0.0", + "private": true, + "type": "module", + "scripts": { + "test": "playwright test", + "test:headed": "playwright test --headed", + "test:debug": "PWDEBUG=1 playwright test" + }, + "devDependencies": { + "@playwright/test": "^1.50.0" + } +} diff --git a/e2e/playwright/playwright.config.ts b/e2e/playwright/playwright.config.ts new file mode 100644 index 0000000..d6dbc0d --- /dev/null +++ b/e2e/playwright/playwright.config.ts @@ -0,0 +1,31 @@ +import { defineConfig, devices } from '@playwright/test'; + +// Single-target Chromium config: the e2e suite is narrow (smoke +// the production-shaped flow against the docker-compose stack). +// Cross-browser matrix doesn't add signal — what we're verifying is +// the server's HTML and the agent's WebSocket handshake, neither of +// which depends on browser engine. + +const baseURL = process.env.RM_BASE_URL ?? 'http://127.0.0.1:8080'; + +export default defineConfig({ + testDir: './tests', + timeout: 60_000, + expect: { timeout: 10_000 }, + fullyParallel: false, + retries: process.env.CI ? 
1 : 0, + workers: 1, + reporter: [['list'], ['html', { open: 'never' }]], + use: { + baseURL, + trace: 'retain-on-failure', + screenshot: 'only-on-failure', + video: 'retain-on-failure', + }, + projects: [ + { + name: 'chromium', + use: { ...devices['Desktop Chrome'] }, + }, + ], +}); diff --git a/e2e/playwright/tests/lib/server.ts b/e2e/playwright/tests/lib/server.ts new file mode 100644 index 0000000..397a908 --- /dev/null +++ b/e2e/playwright/tests/lib/server.ts @@ -0,0 +1,114 @@ +// Helpers used by every test. The shape favours the JSON API for +// reads + accept/dispatch (deterministic, easy to assert) and the +// browser for human-facing surfaces (login form, dashboard render). + +import { APIRequestContext, expect, Page } from '@playwright/test'; + +export const baseURL = process.env.RM_BASE_URL ?? 'http://127.0.0.1:8080'; + +export interface HostJSON { + id: string; + name: string; + status: string; + last_backup_status?: string; +} + +export async function readBootstrapToken(): Promise { + const tok = process.env.RM_BOOTSTRAP_TOKEN; + if (!tok) { + throw new Error('RM_BOOTSTRAP_TOKEN not set — the harness scrapes it from server logs'); + } + return tok; +} + +export async function bootstrapAdmin( + request: APIRequestContext, + { + username = 'admin', + password = 'e2e-test-password-1234', + }: { username?: string; password?: string } = {}, +): Promise<{ username: string; password: string }> { + const token = await readBootstrapToken(); + const res = await request.post(`${baseURL}/api/bootstrap`, { + data: { token, username, password }, + }); + if (!res.ok() && res.status() !== 409 /* already bootstrapped */) { + throw new Error(`bootstrap: ${res.status()} ${await res.text()}`); + } + return { username, password }; +} + +export async function loginViaUI(page: Page, username: string, password: string): Promise { + await page.goto(`${baseURL}/login`); + await page.locator('#login-username').fill(username); + await 
page.locator('#login-password').fill(password); + await Promise.all([ + page.waitForURL(new RegExp(`^${baseURL}/?$`)), + page.locator('form[action="/login"] button[type="submit"]').click(), + ]); +} + +/** + * Polls the dashboard until a pending host card is visible, then + * extracts its pending-id from the inline accept form's action URL. + */ +export async function waitForPendingHostID(page: Page): Promise { + const formLocator = page.locator('form[action^="/api/pending-hosts/"][action$="/accept"]').first(); + await expect(formLocator).toBeVisible({ timeout: 60_000 }); + const action = await formLocator.getAttribute('action'); + if (!action) throw new Error('pending host form has no action attribute'); + const m = action.match(/\/api\/pending-hosts\/([^/]+)\/accept/); + if (!m) throw new Error(`unexpected action URL: ${action}`); + return m[1]; +} + +export async function acceptPending( + request: APIRequestContext, + cookie: string, + pendingID: string, + repo: { url: string; username?: string; password: string }, +): Promise { + const res = await request.post(`${baseURL}/api/pending-hosts/${pendingID}/accept`, { + headers: { cookie, 'content-type': 'application/json' }, + data: { + repo_url: repo.url, + repo_username: repo.username ?? '', + repo_password: repo.password, + }, + }); + if (!res.ok()) { + throw new Error(`accept: ${res.status()} ${await res.text()}`); + } +} + +export async function listHosts(request: APIRequestContext, cookie: string): Promise { + const res = await request.get(`${baseURL}/api/hosts`, { headers: { cookie } }); + if (!res.ok()) throw new Error(`list hosts: ${res.status()} ${await res.text()}`); + const body = (await res.json()) as { items?: HostJSON[]; hosts?: HostJSON[] }; + return body.items ?? body.hosts ?? 
[]; +} + +export async function waitForHostStatus( + request: APIRequestContext, + cookie: string, + matcher: (h: HostJSON) => boolean, + timeoutMs = 60_000, +): Promise { + const deadline = Date.now() + timeoutMs; + let last: HostJSON | undefined; + while (Date.now() < deadline) { + const hosts = await listHosts(request, cookie); + const hit = hosts.find(matcher); + if (hit) return hit; + last = hosts[0]; + await new Promise((r) => setTimeout(r, 1_000)); + } + throw new Error(`waitForHostStatus: timeout. Last seen: ${JSON.stringify(last)}`); +} + +export async function getSessionCookie(page: Page): Promise { + const cookies = await page.context().cookies(); + const c = cookies.find((c) => c.name === 'rm_session'); + if (!c) throw new Error('rm_session cookie not set after login'); + return `${c.name}=${c.value}`; +} diff --git a/e2e/playwright/tests/smoke.spec.ts b/e2e/playwright/tests/smoke.spec.ts new file mode 100644 index 0000000..0dbd307 --- /dev/null +++ b/e2e/playwright/tests/smoke.spec.ts @@ -0,0 +1,80 @@ +// End-to-end smoke: bootstrap → accept pending host → run backup → see succeeded. +// +// The compose stack stands up a server, a sibling rest-server, and an +// agent in announce-and-approve mode. This test drives the operator +// path through the UI (login + dashboard) and the API +// (accept + run-now + poll for terminal) — UI for the human surfaces, +// API for the deterministic ones. + +import { test, expect } from '@playwright/test'; +import { + baseURL, + bootstrapAdmin, + loginViaUI, + waitForPendingHostID, + acceptPending, + waitForHostStatus, + getSessionCookie, +} from './lib/server'; + +test.describe('smoke: enrol-via-announce → backup', () => { + test('happy path completes in under a minute', async ({ page, request }) => { + const { username, password } = await bootstrapAdmin(request); + await loginViaUI(page, username, password); + + // Dashboard renders. 
+ await expect(page.locator('main')).toContainText(/host|fleet|pending/i, { timeout: 10_000 }); + + // Pending host appears (the agent container has been + // announcing since startup). + const pendingID = await waitForPendingHostID(page); + const cookie = await getSessionCookie(page); + + // Accept with the rest-server creds. compose's rest-server runs + // --no-auth, so any credentials work; restic still demands a + // password to encrypt the repo. + await acceptPending(request, cookie, pendingID, { + url: 'rest:http://rest-server:8000/', + password: 'e2e-repo-password', + }); + + // Wait for the host to come online + auto-init to land. + const onlineHost = await waitForHostStatus( + request, cookie, + (h) => h.status === 'online', + 60_000, + ); + expect(onlineHost.id).toBeTruthy(); + + // Trigger a backup via the UI form-post (HX-Redirect to /jobs/{id}). + await page.goto(`${baseURL}/hosts/${onlineHost.id}`); + await Promise.all([ + page.waitForURL(/\/jobs\//), + page.locator('form[action$="/run-backup"] button[type="submit"]').first().click(), + ]); + + // Wait for the host's last_backup_status to flip to 'succeeded'. + // The job page itself is harder to assert on (it uses + // server-pushed updates and a reload-on-finish pattern); the + // host record is the source of truth and is what the dashboard + // surfaces. + const finishedHost = await waitForHostStatus( + request, cookie, + (h) => h.id === onlineHost.id && h.last_backup_status === 'succeeded', + 120_000, + ); + expect(finishedHost.last_backup_status).toBe('succeeded'); + }); +}); + +test.describe('smoke: scrape /metrics', () => { + test('metrics endpoint exposes the host gauge', async ({ request }) => { + // Compose sets RM_METRICS_TRUSTED_CIDR=0.0.0.0/0 so the + // endpoint is open to the test runner. 
+ const res = await request.get(`${baseURL}/metrics`); + expect(res.status()).toBe(200); + const body = await res.text(); + expect(body).toContain('rm_hosts_total'); + expect(body).toContain('rm_build_info{'); + }); +}); diff --git a/tasks.md b/tasks.md index a696930..843a2b6 100644 --- a/tasks.md +++ b/tasks.md @@ -326,12 +326,54 @@ Sizes: **S** = under a day, **M** = 1–3 days, **L** = 3–7 days. ## Phase 5 — OSS readiness -- [ ] **P5-01** (M) Documentation site (mdBook or similar) with install, concepts, security model, screenshots -- [ ] **P5-02** (S) `CONTRIBUTING.md`, `CODE_OF_CONDUCT.md`, issue + PR templates +- [x] **P5-01** (M) Documentation site (mdBook or similar) with install, concepts, security model, screenshots +- [x] **P5-02** (S) `CONTRIBUTING.md`, `CODE_OF_CONDUCT.md`, issue + PR templates - [x] **P5-03** (S) Release automation — **pivoted away from goreleaser/binary archives** on 2026-05-05 (spec: `docs/superpowers/specs/2026-05-05-p5-03-docker-only-release.md`). Single deliverable per tag: a multi-arch (linux amd64+arm64) server image, with cross-compiled agent binaries (linux amd64+arm64, windows amd64) + `install.sh` + `install.ps1` + the systemd unit baked under `/opt/restic-manager/dist/`. The `/agent/binary` and `/install/*` handlers fall back from `/...` to `/...` so a fresh container Just Works. Workflow `.gitea/workflows/release.yml` triggers on `v*.*.*` tag-push (real release: fan-out `:vX.Y.Z`, `:X.Y`, `:X`, plus `:latest` once `MAJOR>=1`) and `workflow_dispatch` (snapshot: `:snapshot-` only). Pushed to the Gitea container registry on this instance — no external creds, no GHCR mirror. Cosign / SBOM / minisign / GHCR mirror deferred to Phase 6. Source builds via `make build` remain a first-class path. -- [ ] **P5-04** (S) Demo screenshots / short Loom walkthrough in README -- [ ] **P5-05** (S) `SECURITY.md` with disclosure process -- [ ] **P5-06** (M) End-to-end test suite in CI (Playwright vs. 
compose stack with sibling Linux agent) +- [x] **P5-04** (S) Demo screenshots / short Loom walkthrough in README +- [x] **P5-05** (S) `SECURITY.md` with disclosure process +- [x] **P5-06** (M) End-to-end test suite in CI (Playwright vs. compose stack with sibling Linux agent) + +> **As shipped (2026-05-07, branch `p5-oss-readiness`):** +> +> **P5-01 — docs site.** mdBook under `docs/book/` with structured +> chapters: getting-started (install, enrolling hosts, reverse +> proxy), concepts (architecture, credentials, schedules + source +> groups, repo maintenance), operations (backups + restores, alerts, +> observability, updates), security (threat model, hardening, +> disclosure), reference (env vars, HTTP endpoints), plus +> contributing / roadmap / license pages. mdBook binary downloaded +> via Makefile (`make docs` / `make docs-watch`) — same "static +> binary, no toolchain" pattern as Tailwind. Generated `book/` +> dir gitignored. +> +> **P5-02 — CONTRIBUTING + CoC + templates.** `CONTRIBUTING.md` +> rewritten from placeholder to full guide (setup, conventions, +> workflow, RBAC of the project itself). `CODE_OF_CONDUCT.md` +> shaped on the Contributor Covenant but adapted for a +> single-maintainer project. `.gitea/issue_template/{bug_report,feature_request}.md` +> + `.gitea/PULL_REQUEST_TEMPLATE.md`. +> +> **P5-04 — README screenshots.** Six full-page captures from a +> fresh server bootstrap under `docs/screenshots/` (login, empty +> dashboard, add host, alerts, settings, audit log). README +> rewritten to centre the screenshot grid + link out to docs site. +> Captured live from a working build via Playwright; replaceable +> as the UI evolves without breaking layout. +> +> **P5-05 — SECURITY.md.** Disclosure policy (3-day ack, 30-day +> default disclosure window), supported-versions matrix, scope +> in/out, threat-model summary, hardening checklist for +> operators. Mirrored as a chapter in the docs site. 
+> +> **P5-06 — e2e harness.** `e2e/compose.e2e.yml` stands up +> server + sibling Linux agent (alpine + restic) + restic/rest-server +> backend, with announce-and-approve as the enrolment path so +> Playwright drives the operator flow end-to-end. Tests under +> `e2e/playwright/tests/`: smoke spec covers bootstrap → login → +> accept-pending → backup → terminal-status; second spec scrapes +> `/metrics` to verify the P6-04 endpoint. New +> `.gitea/workflows/e2e.yml` runs on every PR (separate from the +> fast lint/test workflow). Local how-to in `docs/e2e.md`. - [x] **P5-07** (S) Reference deployment landed alongside P5-03. `deploy/docker-compose.yml` stands up *only* the server (image-pinned via `RM_VERSION`, named volume for operator state, bound to localhost) — TLS termination is left to whichever reverse proxy the operator already runs. `docs/reverse-proxy.md` documents the headers + WebSocket pass-through the proxy must forward, the `RM_TRUSTED_PROXY` CIDR rule, and worked examples for Caddy, nginx, and Traefik. ### Phase 5 acceptance