Files
restic-manager/internal/restic/runner.go
T
steve 3800b34a2b
CI / Test (rest) (pull_request) Successful in 29s
CI / Lint (pull_request) Successful in 32s
CI / Build (windows/amd64) (pull_request) Successful in 22s
CI / Test (store) (pull_request) Successful in 1m22s
CI / Test (server-http) (pull_request) Successful in 1m30s
CI / Build (linux/amd64) (pull_request) Successful in 22s
CI / Build (linux/arm64) (pull_request) Successful in 41s
testing: bootstrap UI, agent reliability, NS-01..04 + alert username
Smooths the rough edges that came up while exercising a live deployment.

First-run bootstrap UI: /bootstrap renders a username + password form
that uses the in-memory token directly (operator no longer copies it
out of the log); /login redirects there while bootstrap is available.

Agent reliability: failJob synthetic envelopes so command.run early
returns no longer hang the server-side job; runtime probe of restic
restore --help drives --no-ownership instead of version sniffing
(0.18.x had it removed). Server unit re-shaped: ProtectSystem=full
plus ReadWritePaths=/etc/restic-manager, no ProtectHome — restore
can now write anywhere a user might want.

Restore wizard: default target is /root/rm-restore/<job-id>/ with
clearer help text. Re-init confirm input uses .field (was .input,
which doesn't exist — text was invisible).

NS-01 host delete: store DeleteHost, admin-band /hosts/{id}/delete
with hostname-confirm danger zone, audit, FK cascade, live WS close.

NS-02 enrollment-token recovery: outstanding-tokens panel on
/hosts/new, regenerate (preserves attachments) and revoke handlers
+ audit, store-level ListOutstandingEnrollmentTokens and
DeleteEnrollmentToken.

NS-03 repo init / probe surface: migration 0020 adds
hosts.repo_status + repo_status_error; WS handler projects every
init job's outcome onto the host row (idempotent already-initialised
collapses to ready); creds-save resets status and dispatches a fresh
probe; /hosts/{id}/repo/probe retry endpoint with banner.

NS-04 dashboard live + sort + filter: query-string filter
(q/status/repo_status/tag/sort/dir), 5s htmx live poll mirroring the
alerts pattern with a localStorage live toggle, sortable column
headers, filter row + clear.

Alerts page: ack'd-by line resolves user_id ULID to username.

Compose.yaml ignored — host-specific.
2026-05-05 22:03:15 +01:00

635 lines
21 KiB
Go

// Package restic wraps the restic CLI: locate the binary, run it
// with --json, parse streamed events. The agent calls this; the
// control-plane never invokes restic.
package restic
import (
"bufio"
"context"
"encoding/json"
"errors"
"fmt"
"io"
"os/exec"
"strings"
"time"
)
// SupportsRestoreNoOwnership reports whether the restic binary at bin
// lists `--no-ownership` in the output of `restic restore --help`.
//
// The binary is asked directly instead of inferring support from a
// version string: the flag is nominally present from 0.17 onwards, yet
// a downstream 0.18.1 build was observed without it.
//
// The probe runs under a five-second timeout derived from ctx. An empty
// bin, or any failure to run the help command, yields false so the
// caller stays on the conservative path of not adding the flag.
func SupportsRestoreNoOwnership(ctx context.Context, bin string) bool {
	if bin == "" {
		return false
	}

	helpCtx, stop := context.WithTimeout(ctx, 5*time.Second)
	defer stop()

	help := exec.CommandContext(helpCtx, bin, "restore", "--help")
	text, runErr := help.CombinedOutput()
	if runErr != nil {
		return false
	}
	return strings.Contains(string(text), "--no-ownership")
}
// Locate returns the restic binary to execute.
//
// A non-empty override is checked with exec.LookPath and returned
// verbatim when that check passes; otherwise an error naming the
// configured path is returned (the PATH fallback is NOT tried in that
// case). With an empty override the binary is looked up as "restic" on
// PATH and the resolved path is returned.
func Locate(override string) (string, error) {
	if override == "" {
		found, err := exec.LookPath("restic")
		if err != nil {
			return "", fmt.Errorf("restic: not on PATH: %w", err)
		}
		return found, nil
	}

	if _, err := exec.LookPath(override); err != nil {
		return "", fmt.Errorf("restic: configured path %q not executable", override)
	}
	return override, nil
}
// Env carries everything needed to launch one restic command.
//
// Credentials are kept apart from the repository URL: RepoURL is the
// bare URL as the operator typed it, and RepoUsername optionally holds
// the HTTP basic-auth user for `rest:` repos. envSlice() combines them
// (via mergeRestCreds) only when building the child environment and
// hands the result to the subprocess as RESTIC_REPOSITORY; the merged
// form is never stored back on Env. Anything that needs to log a URL
// should go through RedactURL rather than printing these fields.
type Env struct {
	// Bin is the path to the restic binary.
	Bin string
	// Version is the restic version, e.g. "0.17.1"; empty if unknown.
	// AtLeastVersion parses it.
	Version string
	// RepoURL becomes RESTIC_REPOSITORY; it holds no embedded creds.
	RepoURL string
	// RepoUsername is the optional HTTP basic-auth user for rest: URLs.
	RepoUsername string
	// RepoPassword is exported as RESTIC_PASSWORD and is also passed to
	// the URL merge as the basic-auth password.
	RepoPassword string
	// ExtraEnv holds additional variables for the child process. HOME
	// and XDG_CACHE_HOME entries override the defaults envSlice picks.
	ExtraEnv map[string]string
	// WorkDir is the child's working directory; empty means the
	// current directory.
	WorkDir string

	// SupportsRestoreNoOwnership records whether the running restic's
	// `restore --help` advertises --no-ownership. Version sniffing was
	// unreliable (a downstream 0.18.1 build rejected the flag with
	// "unknown flag: --no-ownership"), so the agent probes for the flag
	// at startup with the package-level SupportsRestoreNoOwnership and
	// stores the answer here.
	SupportsRestoreNoOwnership bool

	// LimitUploadKBps and LimitDownloadKBps are bandwidth caps in KB/s.
	// A value <= 0 means "no cap" and the flag is omitted; otherwise the
	// value is emitted as the global --limit-upload / --limit-download
	// flag ahead of the subcommand on every invocation.
	LimitUploadKBps   int
	LimitDownloadKBps int
}
// AtLeastVersion reports whether e.Version is at or above the given
// major.minor pair.
//
// The comparison is best-effort. Version is trimmed and split on "."
// and only the first two components are examined, so the patch level
// is ignored. An empty version, one with fewer than two components, or
// one whose major/minor are not plain decimal digits returns false so
// callers stay on the conservative path.
func (e Env) AtLeastVersion(major, minor int) bool {
	fields := strings.SplitN(strings.TrimSpace(e.Version), ".", 3)
	if len(fields) < 2 {
		// Covers the empty string too: splitting "" yields one field.
		return false
	}

	gotMajor, majorErr := atoi(fields[0])
	gotMinor, minorErr := atoi(fields[1])
	if majorErr != nil || minorErr != nil {
		return false
	}

	switch {
	case gotMajor > major:
		return true
	case gotMajor < major:
		return false
	default:
		return gotMinor >= minor
	}
}
// atoi parses s as an unsigned decimal integer. It exists so this file
// does not need the strconv import for a single helper.
//
// It is stricter than strconv.Atoi: only the ASCII digits '0'-'9' are
// accepted, so a leading sign or any other character is an error.
// An empty string is an error. A value that does not fit in an int is
// also an error, where previously it wrapped around silently.
func atoi(s string) (int, error) {
	if len(s) == 0 {
		return 0, fmt.Errorf("empty")
	}

	const maxInt = int(^uint(0) >> 1)

	n := 0
	for _, r := range s {
		if r < '0' || r > '9' {
			return 0, fmt.Errorf("not a digit: %q", r)
		}
		d := int(r - '0')
		// n*10+d would exceed maxInt exactly when n > (maxInt-d)/10.
		if n > (maxInt-d)/10 {
			return 0, fmt.Errorf("out of range: %q", s)
		}
		n = n*10 + d
	}
	return n, nil
}
// globalArgs builds the global restic flags that go in front of the
// subcommand. At present that is only the bandwidth caps: each cap
// with a positive value contributes its flag and decimal value, upload
// first and then download. The result is nil when neither cap is set.
func (e Env) globalArgs() []string {
	limits := [...]struct {
		flag string
		kbps int
	}{
		{"--limit-upload", e.LimitUploadKBps},
		{"--limit-download", e.LimitDownloadKBps},
	}

	var flags []string
	for _, l := range limits {
		if l.kbps > 0 {
			flags = append(flags, l.flag, fmt.Sprint(l.kbps))
		}
	}
	return flags
}
// resticCmd assembles the exec.Cmd for one restic invocation: the
// global flags from globalArgs come first, then the caller's
// subcommand arguments. Every Run* method in this file builds its
// command here, so the bandwidth caps, environment and working
// directory are applied uniformly.
//
// Cancellation: exec.CommandContext kills the process outright by
// default when ctx is cancelled, which would give restic no chance to
// release its repository lock. Cmd.Cancel is therefore replaced with a
// function that sends sigterm, and Cmd.WaitDelay is set to five
// seconds so the process is still force-killed if it does not exit in
// time. This is what lets an operator cancel a long-running
// backup/restore from the UI without leaving a lock behind.
func (e Env) resticCmd(ctx context.Context, sub ...string) *exec.Cmd {
	global := e.globalArgs()
	argv := make([]string, 0, len(global)+len(sub))
	argv = append(argv, global...)
	argv = append(argv, sub...)

	cmd := exec.CommandContext(ctx, e.Bin, argv...)
	cmd.Dir = e.WorkDir
	cmd.Env = e.envSlice()
	cmd.WaitDelay = 5 * time.Second
	cmd.Cancel = func() error {
		// Cancel only fires after Start, when Process is already set;
		// the nil guard is purely defensive. Signal reports
		// os.ErrProcessDone if the process has already exited, which is
		// harmless here.
		if proc := cmd.Process; proc != nil {
			return proc.Signal(sigterm)
		}
		return nil
	}
	return cmd
}
// EventKind names the message_type values this package recognises in
// the --json output of `restic backup`. Other restic commands emit
// different shapes; the parser switches on message_type.
type EventKind string

// The message_type values handled while pumping backup output. They
// are constants so callers can switch on them without typo risk.
const (
	// EventStatus is the periodic progress report.
	EventStatus = EventKind("status")
	// EventVerbose is decoded the same way as EventStatus.
	EventVerbose = EventKind("verbose_status")
	// EventSummary is emitted once at the end of a backup.
	EventSummary = EventKind("summary")
	// EventErrorEvent marks an error message line.
	EventErrorEvent = EventKind("error")
)
// BackupStatus is the decoded form of one progress line printed by
// `restic backup --json`. Each field maps one-to-one onto the JSON key
// named in its tag.
type BackupStatus struct {
	// MessageType is the line's message_type discriminator.
	MessageType string `json:"message_type"`
	// PercentDone is restic's percent_done progress figure.
	PercentDone float64 `json:"percent_done"`
	// TotalFiles and FilesDone are the file counters.
	TotalFiles int64 `json:"total_files"`
	FilesDone  int64 `json:"files_done"`
	// TotalBytes and BytesDone are the byte counters.
	TotalBytes int64 `json:"total_bytes"`
	BytesDone  int64 `json:"bytes_done"`
	// SecondsElapsed and SecondsRem are restic's timing figures.
	SecondsElapsed int64 `json:"seconds_elapsed"`
	SecondsRem     int64 `json:"seconds_remaining"`
}
// BackupSummary is the decoded form of the summary line that
// `restic backup --json` prints at the end of a run. Each field maps
// one-to-one onto the JSON key named in its tag.
type BackupSummary struct {
	// MessageType is the line's message_type discriminator.
	MessageType string `json:"message_type"`

	// File counters, by change state.
	FilesNew        int64 `json:"files_new"`
	FilesChanged    int64 `json:"files_changed"`
	FilesUnmodified int64 `json:"files_unmodified"`

	// Directory counters, by change state.
	DirsNew        int64 `json:"dirs_new"`
	DirsChanged    int64 `json:"dirs_changed"`
	DirsUnmodified int64 `json:"dirs_unmodified"`

	// Totals for the whole run.
	DataAdded           int64   `json:"data_added"`
	TotalFilesProcessed int64   `json:"total_files_processed"`
	TotalBytesProcessed int64   `json:"total_bytes_processed"`
	TotalDuration       float64 `json:"total_duration"`

	// SnapshotID identifies the snapshot the run produced.
	SnapshotID string `json:"snapshot_id"`
}
// LineHandler is the callback that receives every line a restic
// subprocess writes.
//
// stream labels the line's origin; the pumps in this file use
// "stdout", "stderr" and "event" (a recognised JSON message or a
// wrapper-generated notice). raw is always the original line text, so
// it can be teed to job logs. event is non-nil only when the line was
// decoded into a typed status or summary value.
//
// The Run* helpers read stdout and stderr on separate goroutines and
// call the same handler from both, so an implementation must tolerate
// concurrent calls.
type LineHandler func(stream, raw string, event any)
// RunBackup runs `restic backup --json` over paths, adding one
// --exclude per entry in excludes and one --tag per entry in tags, and
// streams the output through handle.
//
// It returns the parsed summary (nil if no summary line was decoded)
// together with a nil error when restic exits with code 0 or 3; code 3
// means the backup completed but had issues, and restic still produced
// a snapshot. Any other failure is wrapped as "restic backup: …", and
// whatever summary was captured is returned alongside it.
//
// Reader errors from the output pumps are not returned; they are
// reported to handle as "event" lines.
func (e Env) RunBackup(ctx context.Context, paths, excludes, tags []string, handle LineHandler) (*BackupSummary, error) {
	argv := make([]string, 0, 2+2*len(excludes)+2*len(tags)+len(paths))
	argv = append(argv, "backup", "--json")
	for _, pattern := range excludes {
		argv = append(argv, "--exclude", pattern)
	}
	for _, label := range tags {
		argv = append(argv, "--tag", label)
	}
	argv = append(argv, paths...)

	cmd := e.resticCmd(ctx, argv...)
	outPipe, err := cmd.StdoutPipe()
	if err != nil {
		return nil, fmt.Errorf("restic backup: stdout pipe: %w", err)
	}
	errPipe, err := cmd.StderrPipe()
	if err != nil {
		return nil, fmt.Errorf("restic backup: stderr pipe: %w", err)
	}
	if err := cmd.Start(); err != nil {
		return nil, fmt.Errorf("restic backup: start: %w", err)
	}

	var summary *BackupSummary
	pumps := make(chan error, 2)
	go func() { pumps <- pumpStdout(outPipe, handle, &summary) }()
	go func() { pumps <- pumpStderr(errPipe, handle) }()

	// Both pumps must finish reading before Wait is called: Wait closes
	// the pipes, so calling it earlier could drop output.
	for pending := 2; pending > 0; pending-- {
		pumpErr := <-pumps
		if pumpErr == nil || handle == nil {
			continue
		}
		handle("event", fmt.Sprintf("pump error: %v", pumpErr), nil)
	}

	waitErr := cmd.Wait()
	if waitErr == nil {
		return summary, nil
	}

	var exitErr *exec.ExitError
	if errors.As(waitErr, &exitErr) && exitErr.ExitCode() == 3 {
		// "incomplete backup": restic still produced a snapshot.
		return summary, nil
	}
	return summary, fmt.Errorf("restic backup: %w", waitErr)
}
// ForgetPolicy holds the retention dimensions that map onto restic
// forget's --keep-* flags. Every field is optional: a nil pointer
// means the flag is not passed at all, while a non-nil pointer is
// always emitted with the value it points to (including 0).
type ForgetPolicy struct {
	KeepLast    *int
	KeepHourly  *int
	KeepDaily   *int
	KeepWeekly  *int
	KeepMonthly *int
	KeepYearly  *int
}

// args renders the policy as --keep-* flag/value pairs, in the fixed
// order last, hourly, daily, weekly, monthly, yearly. The result is an
// empty (non-nil) slice when nothing is set; callers should reject
// such a policy before calling RunForget, because restic refuses to
// forget without any keep-* flag.
func (p ForgetPolicy) args() []string {
	dims := [...]struct {
		flag string
		keep *int
	}{
		{"--keep-last", p.KeepLast},
		{"--keep-hourly", p.KeepHourly},
		{"--keep-daily", p.KeepDaily},
		{"--keep-weekly", p.KeepWeekly},
		{"--keep-monthly", p.KeepMonthly},
		{"--keep-yearly", p.KeepYearly},
	}

	out := make([]string, 0, 2*len(dims))
	for _, d := range dims {
		if d.keep == nil {
			continue
		}
		out = append(out, d.flag, fmt.Sprint(*d.keep))
	}
	return out
}

// Empty reports whether no retention dimension is set, i.e. every
// field is a nil pointer.
func (p ForgetPolicy) Empty() bool {
	// All fields are pointers, so "everything nil" is exactly the zero
	// value of the struct.
	return p == (ForgetPolicy{})
}
// ForgetGroup pairs a snapshot tag with the retention policy to apply
// to it. RunForget issues one `restic forget --tag <Tag> --keep-* …`
// per group, so retention can target one source-group's snapshots
// without touching snapshots tagged for other groups.
type ForgetGroup struct {
	// Tag is passed to restic as the --tag filter.
	Tag string
	// Policy supplies the --keep-* flags for that tag.
	Policy ForgetPolicy
}
// RunForget runs one `restic forget --json --tag <Tag> --keep-* …`
// invocation per group, in slice order, streaming output to handle.
//
// It does NOT pass --prune: pruning lives behind a separate admin-only
// credential (see spec §4.3 / P2-06). Forget only rewrites the
// snapshot index; the data itself is removed by the next prune.
//
// Input is validated before anything is executed. An empty groups
// slice is rejected (it would be a no-op), and any group with an empty
// policy is rejected (restic forget without a keep-* flag would delete
// every snapshot in the tagged set). Because validation runs first, an
// invalid group can no longer cause the groups listed before it to be
// forgotten as a side effect of a call that then fails.
//
// Returns the first error encountered, or nil when every group exits
// cleanly. Groups that ran before a failing invocation stay applied.
func (e Env) RunForget(ctx context.Context, groups []ForgetGroup, handle LineHandler) error {
	if len(groups) == 0 {
		return fmt.Errorf("restic forget: refusing to run with no groups (would be a no-op)")
	}

	// Reject bad input up front so a failed call has no side effects.
	for _, g := range groups {
		if g.Policy.Empty() {
			return fmt.Errorf("restic forget: group %q has empty retention policy (would delete every snapshot)", g.Tag)
		}
	}

	for _, g := range groups {
		args := []string{"forget", "--json", "--tag", g.Tag}
		args = append(args, g.Policy.args()...)
		if err := runWithPump(e.resticCmd(ctx, args...), handle); err != nil {
			return err
		}
	}
	return nil
}
// RunInit runs `restic init` against the configured repository and
// returns nil on success.
//
// Init's output is short and not JSON, so stdout and stderr are
// forwarded verbatim through handle; the operator sees the same lines
// they would at the CLI.
//
// Running init against an already-initialised repo is treated as a
// soft success: if a stderr line contains "config file already exists"
// and restic then exits non-zero, an explanatory "event" line is sent
// to handle and nil is returned. Any other failure is returned as-is.
func (e Env) RunInit(ctx context.Context, handle LineHandler) error {
	const existsMarker = "config file already exists"

	// forward relays to the caller's handler, which may be nil.
	forward := func(stream, line string, ev any) {
		if handle != nil {
			handle(stream, line, ev)
		}
	}

	// Only the stderr pump writes sawExisting, and it is read after
	// runWithPump has collected both pumps.
	sawExisting := false
	watch := func(stream, line string, ev any) {
		if stream == "stderr" && strings.Contains(line, existsMarker) {
			sawExisting = true
		}
		forward(stream, line, ev)
	}

	err := runWithPump(e.resticCmd(ctx, "init"), watch)
	switch {
	case err == nil:
		return nil
	case !sawExisting:
		return err
	}

	forward("event", "repo already initialised — treating as success", nil)
	return nil
}
// RunPrune runs `restic prune` against the configured repository.
//
// Prune needs the *admin* credentials (delete access on the
// rest-server repo); the caller must have put the admin pair into
// Env.RepoUsername and Env.RepoPassword before calling.
//
// No --json flag is passed: prune's human-readable stdout/stderr is
// forwarded line by line to handle, so the live log doubles as the
// operator's progress display.
func (e Env) RunPrune(ctx context.Context, handle LineHandler) error {
	prune := e.resticCmd(ctx, "prune")
	return runWithPump(prune, handle)
}
// runWithPump starts cmd, forwards its stdout and stderr line by line
// to handle (through pumpPlain, one goroutine per stream), waits for
// it to exit, and wraps any failure with a "restic <verb>" label.
//
// Reader errors from the pumps are not returned; they are reported to
// handle as "event" lines. Both pumps are collected before cmd.Wait is
// called because Wait closes the pipes.
func runWithPump(cmd *exec.Cmd, handle LineHandler) error {
	label := resticVerbLabel(cmd.Args)

	stdout, err := cmd.StdoutPipe()
	if err != nil {
		return fmt.Errorf("%s: stdout pipe: %w", label, err)
	}
	stderr, err := cmd.StderrPipe()
	if err != nil {
		return fmt.Errorf("%s: stderr pipe: %w", label, err)
	}
	if err := cmd.Start(); err != nil {
		return fmt.Errorf("%s: start: %w", label, err)
	}

	done := make(chan error, 2)
	go func() { done <- pumpPlain(stdout, "stdout", handle) }()
	go func() { done <- pumpPlain(stderr, "stderr", handle) }()
	for i := 0; i < 2; i++ {
		if err := <-done; err != nil && handle != nil {
			handle("event", fmt.Sprintf("pump error: %v", err), nil)
		}
	}

	if werr := cmd.Wait(); werr != nil {
		return fmt.Errorf("%s: %w", label, werr)
	}
	return nil
}

// resticVerbLabel derives the "restic <subcommand>" error prefix from
// a command's argv (argv[0] is the binary).
//
// The subcommand is not necessarily argv[1]: resticCmd places the
// global bandwidth flags in front of it, so "--limit-upload N" and
// "--limit-download N" pairs and any other leading flag are skipped.
// Returns plain "restic" when no subcommand is found.
func resticVerbLabel(argv []string) string {
	for i := 1; i < len(argv); i++ {
		arg := argv[i]
		switch {
		case arg == "--limit-upload" || arg == "--limit-download":
			i++ // the next element is this flag's value, not the verb
		case strings.HasPrefix(arg, "-"):
			// some other flag (or --flag=value form); keep looking
		default:
			return "restic " + arg
		}
	}
	return "restic"
}
// RunUnlock runs `restic unlock`, forwarding its output to handle.
// It returns nil when the command exits cleanly.
func (e Env) RunUnlock(ctx context.Context, handle LineHandler) error {
	unlock := e.resticCmd(ctx, "unlock")
	return runWithPump(unlock, handle)
}
// RepoStats is the decoded output of
// `restic stats --json --mode raw-data`. Each field maps one-to-one
// onto the JSON key named in its tag.
type RepoStats struct {
	// Size figures.
	TotalSize         int64 `json:"total_size"`
	TotalUncompressed int64 `json:"total_uncompressed_size"`

	// Object counts.
	SnapshotsCount int64 `json:"snapshots_count"`
	TotalFileCount int64 `json:"total_file_count"`
	TotalBlobCount int64 `json:"total_blob_count"`
}
// RunStats runs `restic stats --json --mode raw-data` and decodes the
// JSON it prints.
//
// Every output line is still forwarded to handle so the caller can log
// it. Along the way, each stdout line beginning with "{" is tried as a
// RepoStats document; the last one that decodes wins. An error is
// returned if the command fails, or if it succeeds without producing
// any decodable JSON line on stdout.
func (e Env) RunStats(ctx context.Context, handle LineHandler) (*RepoStats, error) {
	var parsed *RepoStats
	tee := func(stream, line string, ev any) {
		if stream == "stdout" && strings.HasPrefix(line, "{") {
			candidate := new(RepoStats)
			if err := json.Unmarshal([]byte(line), candidate); err == nil {
				parsed = candidate
			}
		}
		if handle != nil {
			handle(stream, line, ev)
		}
	}

	statsCmd := e.resticCmd(ctx, "stats", "--json", "--mode", "raw-data")
	if err := runWithPump(statsCmd, tee); err != nil {
		return nil, err
	}
	if parsed == nil {
		return nil, fmt.Errorf("restic stats: no JSON in output")
	}
	return parsed, nil
}
// CheckResult is the outcome of one `restic check` run as reported by
// RunCheck.
type CheckResult struct {
	// LockPresent is set when a stderr line carried a lock signal
	// ("stale lock" or "already locked"). The caller is expected to
	// surface it in the UI so the operator can run unlock.
	LockPresent bool
	// ErrorsFound is set when check exited with a non-zero status,
	// which is taken to mean errors were detected in the repository.
	ErrorsFound bool
}
// RunCheck runs `restic check`, optionally with --read-data-subset.
// A subsetPct of 0 (or less) omits the flag; a positive value passes
// `--read-data-subset N%`. Output is forwarded to handle.
//
// The returned CheckResult is populated even when an error is also
// returned, so the caller can persist last_check_status:
//
//   - LockPresent is set when a stderr line mentions "stale lock" or
//     "already locked".
//   - ErrorsFound is set when restic exits non-zero. That case returns
//     a nil error: a failing check is a result, not a wrapper failure.
//     Note that a locked repository also exits non-zero, so both flags
//     can be set together.
//
// A non-nil error is reserved for invocations that did not produce a
// verdict: the binary could not be started, or ctx was cancelled or
// timed out. The ctx case matters because a cancelled restic is killed
// by signal and surfaces as an exit error; without the explicit check
// an operator-cancelled run would be recorded as repository
// corruption. The returned error wraps both the process error and
// ctx.Err(), so errors.Is(err, context.Canceled) works.
func (e Env) RunCheck(ctx context.Context, subsetPct int, handle LineHandler) (CheckResult, error) {
	args := []string{"check"}
	if subsetPct > 0 {
		args = append(args, "--read-data-subset", fmt.Sprintf("%d%%", subsetPct))
	}
	cmd := e.resticCmd(ctx, args...)

	var res CheckResult
	sniff := func(stream, line string, ev any) {
		if stream == "stderr" {
			if strings.Contains(line, "stale lock") || strings.Contains(line, "already locked") {
				res.LockPresent = true
			}
		}
		if handle != nil {
			handle(stream, line, ev)
		}
	}

	err := runWithPump(cmd, sniff)
	if err == nil {
		return res, nil
	}

	// An interrupted run says nothing about the repository's health.
	if ctxErr := ctx.Err(); ctxErr != nil {
		return res, fmt.Errorf("%w (%w)", err, ctxErr)
	}

	var ee *exec.ExitError
	if errors.As(err, &ee) {
		res.ErrorsFound = true
		return res, nil
	}
	return res, err
}
// pumpPlain reads r line by line and passes each line to handle under
// the given stream label, with a nil event. A nil handle is allowed;
// the input is then read and thrown away.
//
// Lines may be up to 1 MiB long. A longer line (or any read error)
// stops the scanner, and that error is returned. Before returning, the
// remainder of r is drained and discarded: r is normally a subprocess
// pipe, and if nobody keeps reading it the child can block on a full
// pipe while the parent blocks waiting for the child to exit.
func pumpPlain(r io.Reader, stream string, handle LineHandler) error {
	scanner := bufio.NewScanner(r)
	scanner.Buffer(make([]byte, 0, 64*1024), 1024*1024)
	for scanner.Scan() {
		if handle != nil {
			handle(stream, scanner.Text(), nil)
		}
	}

	err := scanner.Err()
	if err != nil {
		// Best-effort drain; the scanner error is the one worth reporting.
		_, _ = io.Copy(io.Discard, r)
	}
	return err
}
// envSlice renders Env as the KEY=VALUE list given to os/exec.
//
// The parent process's environment is deliberately NOT inherited. Only
// the variables built here reach restic, so RESTIC_* / AWS_* / B2_*
// values from the operator's shell or the systemd unit cannot leak in,
// and the control-plane stays the single source of truth.
//
// Credentials travel only through the environment (RESTIC_REPOSITORY
// with merged rest: creds, RESTIC_PASSWORD), which keeps them off the
// process command line visible in ps(1).
//
// HOME and XDG_CACHE_HOME are always set because restic needs one of
// them to place its cache and otherwise fails before contacting the
// repo. HOME defaults to /var/lib/restic-manager and XDG_CACHE_HOME to
// "<HOME>/.cache". Pinning them keeps restic's cache in an agent-owned
// directory even though the unit no longer restricts home-directory
// access, and copes with a parent whose HOME is unset. A non-empty
// HOME or XDG_CACHE_HOME in ExtraEnv overrides the default; every
// other ExtraEnv entry is appended as-is, in map iteration order.
func (e Env) envSlice() []string {
	// Indexing a nil or missing key yields "", which selects the default.
	home := e.ExtraEnv["HOME"]
	if home == "" {
		home = "/var/lib/restic-manager"
	}
	cache := e.ExtraEnv["XDG_CACHE_HOME"]
	if cache == "" {
		cache = home + "/.cache"
	}

	env := make([]string, 0, 5+len(e.ExtraEnv))
	env = append(env,
		"RESTIC_REPOSITORY="+mergeRestCreds(e.RepoURL, e.RepoUsername, e.RepoPassword),
		"RESTIC_PASSWORD="+e.RepoPassword,
		"PATH=/usr/local/bin:/usr/bin:/bin",
		"HOME="+home,
		"XDG_CACHE_HOME="+cache,
	)

	for key, val := range e.ExtraEnv {
		switch key {
		case "HOME", "XDG_CACHE_HOME":
			// Already folded into the fixed entries above.
			continue
		}
		env = append(env, key+"="+val)
	}
	return env
}
// pumpStdout reads the stdout of `restic backup --json` line by line,
// classifies each line by its message_type, and forwards it to handle.
//
// Forwarding rules:
//   - status / verbose_status lines that decode: ("event", line, BackupStatus)
//   - summary lines that decode: ("event", line, BackupSummary)
//   - error lines: ("event", line, nil)
//   - everything else, including non-JSON text, unknown message types
//     and lines that fail to decode: ("stdout", line, nil)
//
// When summary is non-nil, *summary is set to the most recent decoded
// summary. This happens whether or not a handler is supplied; before,
// a nil handle skipped parsing altogether, so a caller that passed no
// handler never received a summary.
//
// Lines may be up to 4 MiB long. If the scanner stops with an error,
// the rest of r is drained and discarded before that error is
// returned, so a subprocess writing to the pipe cannot block on a full
// buffer while the caller waits for it to exit.
func pumpStdout(r io.Reader, handle LineHandler, summary **BackupSummary) error {
	// emit forwards to the caller's handler, which may be nil.
	emit := func(stream, raw string, ev any) {
		if handle != nil {
			handle(stream, raw, ev)
		}
	}

	scanner := bufio.NewScanner(r)
	scanner.Buffer(make([]byte, 0, 64*1024), 4*1024*1024) // status lines can get long
	for scanner.Scan() {
		line := scanner.Text()

		// Check the first byte before attempting a decode so non-JSON
		// lines (rare on stdout, but possible) pass through untouched.
		if !strings.HasPrefix(line, "{") {
			emit("stdout", line, nil)
			continue
		}

		var probe struct {
			MessageType string `json:"message_type"`
		}
		if err := json.Unmarshal([]byte(line), &probe); err != nil {
			emit("stdout", line, nil)
			continue
		}

		switch EventKind(probe.MessageType) {
		case EventStatus, EventVerbose:
			var ev BackupStatus
			if json.Unmarshal([]byte(line), &ev) == nil {
				emit("event", line, ev)
				continue
			}
		case EventSummary:
			var ev BackupSummary
			if json.Unmarshal([]byte(line), &ev) == nil {
				if summary != nil {
					s := ev
					*summary = &s
				}
				emit("event", line, ev)
				continue
			}
		case EventErrorEvent:
			emit("event", line, nil)
			continue
		}

		// Unknown message type, or a known one that failed to decode.
		emit("stdout", line, nil)
	}

	err := scanner.Err()
	if err != nil {
		// Best-effort drain; the scanner error is the one worth reporting.
		_, _ = io.Copy(io.Discard, r)
	}
	return err
}
// pumpStderr forwards every line read from r to handle under the
// "stderr" stream label, with a nil event, and returns the reader's
// error if any. It is the stderr counterpart of pumpStdout for backup
// runs and delegates to pumpPlain, which implements the same line
// loop, so both code paths share one implementation.
func pumpStderr(r io.Reader, handle LineHandler) error {
	return pumpPlain(r, "stderr", handle)
}
// Blank-identifier reference that keeps the time import in use even if
// no other code in this file touches the package.
// NOTE(review): time is used directly elsewhere in this file (the
// help-probe timeout and the Cmd.WaitDelay setting), so this line looks
// redundant and is a candidate for removal.
var _ = time.Now