testing: bootstrap UI, agent reliability, NS-01..04 + alert username
CI / Test (rest) (pull_request) Successful in 29s
CI / Lint (pull_request) Successful in 32s
CI / Build (windows/amd64) (pull_request) Successful in 22s
CI / Test (store) (pull_request) Successful in 1m22s
CI / Test (server-http) (pull_request) Successful in 1m30s
CI / Build (linux/amd64) (pull_request) Successful in 22s
CI / Build (linux/arm64) (pull_request) Successful in 41s
CI / Test (rest) (pull_request) Successful in 29s
CI / Lint (pull_request) Successful in 32s
CI / Build (windows/amd64) (pull_request) Successful in 22s
CI / Test (store) (pull_request) Successful in 1m22s
CI / Test (server-http) (pull_request) Successful in 1m30s
CI / Build (linux/amd64) (pull_request) Successful in 22s
CI / Build (linux/arm64) (pull_request) Successful in 41s
Smooths the rough edges that came up while exercising a live deployment.
First-run bootstrap UI: /bootstrap renders a username + password form
that uses the in-memory token directly (operator no longer copies it
out of the log); /login redirects there while bootstrap is available.
Agent reliability: failJob sends synthetic job.started + job.finished
envelopes so early returns from command.run no longer hang the
server-side job; a runtime probe of `restic restore --help` now decides
whether --no-ownership is passed, replacing version sniffing
(0.18.x had it removed). Server unit re-shaped: ProtectSystem=full
plus ReadWritePaths=/etc/restic-manager, no ProtectHome — restore
can now write anywhere a user might want.
Restore wizard: default target is /root/rm-restore/<job-id>/ with
clearer help text. Re-init confirm input uses .field (was .input,
which doesn't exist — text was invisible).
NS-01 host delete: store DeleteHost, admin-band /hosts/{id}/delete
with hostname-confirm danger zone, audit, FK cascade, live WS close.
NS-02 enrollment-token recovery: outstanding-tokens panel on
/hosts/new, regenerate (preserves attachments) and revoke handlers
+ audit, store-level ListOutstandingEnrollmentTokens and
DeleteEnrollmentToken.
NS-03 repo init / probe surface: migration 0020 adds
hosts.repo_status + repo_status_error; WS handler projects every
init job's outcome onto the host row (idempotent already-initialised
collapses to ready); creds-save resets status and dispatches a fresh
probe; /hosts/{id}/repo/probe retry endpoint with banner.
NS-04 dashboard live + sort + filter: query-string filter
(q/status/repo_status/tag/sort/dir), 5s htmx live poll mirroring the
alerts pattern with a localStorage live toggle, sortable column
headers, filter row + clear.
Alerts page: ack'd-by line resolves user_id ULID to username.
Compose.yaml ignored — host-specific.
This commit is contained in:
+68
-22
@@ -115,6 +115,12 @@ func run() error {
|
||||
|
||||
resticBin, _ := restic.Locate(cfg.ResticPath) // empty is fine; commands fail with a clear error later
|
||||
|
||||
// Probe the actual restic binary for restore-flag support. We used
|
||||
// to gate --no-ownership on a SemVer comparison (added in 0.17),
|
||||
// but a restic 0.18.1 build was observed in the wild that still
|
||||
// rejects the flag. The help text is the only reliable signal.
|
||||
resticSupportsNoOwnership := restic.SupportsRestoreNoOwnership(ctx, resticBin)
|
||||
|
||||
// Open the secrets store. If the agent is enrolled but has no
|
||||
// secrets key yet (legacy YAML), mint one and migrate any
|
||||
// plaintext repo fields into the encrypted blob.
|
||||
@@ -139,10 +145,11 @@ func run() error {
|
||||
}
|
||||
|
||||
d := &dispatcher{
|
||||
resticBin: resticBin,
|
||||
resticVer: snap.ResticVersion,
|
||||
secrets: sec,
|
||||
scheduler: scheduler.New(),
|
||||
resticBin: resticBin,
|
||||
resticVer: snap.ResticVersion,
|
||||
resticSupportsNoOwnership: resticSupportsNoOwnership,
|
||||
secrets: sec,
|
||||
scheduler: scheduler.New(),
|
||||
}
|
||||
if err := wsclient.Run(ctx, wsCfg, d.handle); err != nil {
|
||||
return fmt.Errorf("ws run: %w", err)
|
||||
@@ -204,10 +211,11 @@ func openSecretsStore(cfg *config.Config) (*secrets.Store, error) {
|
||||
// secrets store on each job — config.update writes through to disk,
|
||||
// so a job dispatched in the same session sees the latest values.
|
||||
type dispatcher struct {
|
||||
resticBin string
|
||||
resticVer string // e.g. "0.17.1"; empty if restic isn't installed yet
|
||||
secrets *secrets.Store
|
||||
scheduler *scheduler.Scheduler
|
||||
resticBin string
|
||||
resticVer string // e.g. "0.17.1"; empty if restic isn't installed yet
|
||||
resticSupportsNoOwnership bool // captured at startup from `restic restore --help`
|
||||
secrets *secrets.Store
|
||||
scheduler *scheduler.Scheduler
|
||||
|
||||
// Bandwidth caps in KB/s pushed via config.update. Mutated under
|
||||
// bwMu by the config.update handler; read by runJob when building
|
||||
@@ -464,17 +472,47 @@ func (d *dispatcher) handleTreeList(ctx context.Context, reqID string, p api.Tre
|
||||
reply(api.TreeListResultPayload{Entries: apiEntries})
|
||||
}
|
||||
|
||||
// failJob ships a synthetic job.started + job.finished(failed) pair
|
||||
// for a command.run we couldn't even spawn locally — missing restic
|
||||
// binary, missing credentials, or a malformed payload. Without these
|
||||
// envelopes the server has no way to know the job will never produce
|
||||
// output: the row sits in "running", the live stream stays stuck on
|
||||
// "awaiting agent output," and a subsequent command.cancel arrives
|
||||
// for a job_id the agent never registered (we log "unknown job"
|
||||
// because trackJob was never called). Sending a terminal envelope
|
||||
// here closes the loop on both fronts.
|
||||
func failJob(p api.CommandRunPayload, tx wsclient.Sender, errMsg string) {
|
||||
now := time.Now().UTC()
|
||||
if startedEnv, err := api.Marshal(api.MsgJobStarted, p.JobID, api.JobStartedPayload{
|
||||
JobID: p.JobID, Kind: p.Kind, StartedAt: now,
|
||||
}); err == nil {
|
||||
_ = tx.Send(startedEnv)
|
||||
}
|
||||
if finEnv, err := api.Marshal(api.MsgJobFinished, p.JobID, api.JobFinishedPayload{
|
||||
JobID: p.JobID,
|
||||
Status: api.JobFailed,
|
||||
ExitCode: -1,
|
||||
FinishedAt: now,
|
||||
Error: errMsg,
|
||||
}); err == nil {
|
||||
_ = tx.Send(finEnv)
|
||||
}
|
||||
}
|
||||
|
||||
// runJob spawns a runner for one job. We launch a goroutine so the
|
||||
// WS read loop keeps draining messages while restic chugs along.
|
||||
func (d *dispatcher) runJob(ctx context.Context, p api.CommandRunPayload, tx wsclient.Sender) error {
|
||||
if d.resticBin == "" {
|
||||
failJob(p, tx, "restic binary not located on this agent")
|
||||
return fmt.Errorf("restic binary not located on this agent")
|
||||
}
|
||||
creds, err := d.secrets.Load()
|
||||
if err != nil {
|
||||
failJob(p, tx, "load repo credentials: "+err.Error())
|
||||
return fmt.Errorf("load repo credentials: %w", err)
|
||||
}
|
||||
if creds.Empty() {
|
||||
failJob(p, tx, "repo credentials not configured (waiting for server config.update push)")
|
||||
return fmt.Errorf("repo credentials not configured (waiting for server config.update push)")
|
||||
}
|
||||
// r is the everyday runner — bound to the host's repo
|
||||
@@ -498,13 +536,14 @@ func (d *dispatcher) runJob(ctx context.Context, p api.CommandRunPayload, tx wsc
|
||||
}
|
||||
|
||||
r := runner.New(runner.Config{
|
||||
ResticBin: d.resticBin,
|
||||
ResticVersion: d.resticVer,
|
||||
RepoURL: creds.URL,
|
||||
RepoUsername: creds.Username,
|
||||
RepoPassword: creds.Password,
|
||||
LimitUploadKBps: upKBps,
|
||||
LimitDownloadKBps: downKBps,
|
||||
ResticBin: d.resticBin,
|
||||
ResticVersion: d.resticVer,
|
||||
RepoURL: creds.URL,
|
||||
RepoUsername: creds.Username,
|
||||
RepoPassword: creds.Password,
|
||||
SupportsRestoreNoOwnership: d.resticSupportsNoOwnership,
|
||||
LimitUploadKBps: upKBps,
|
||||
LimitDownloadKBps: downKBps,
|
||||
}, tx, time.Second)
|
||||
|
||||
// spawn wraps the kind-specific goroutine: derives a per-job
|
||||
@@ -560,6 +599,7 @@ func (d *dispatcher) runJob(ctx context.Context, p api.CommandRunPayload, tx wsc
|
||||
// policy fallback was specced but skipped — see the
|
||||
// Phase 5 plan rationale and version.go's lockstep-deploy
|
||||
// note for why.
|
||||
failJob(p, tx, "forget: command.run carried no forget_groups (server didn't populate them)")
|
||||
return fmt.Errorf("forget: command.run carried no forget_groups (server didn't populate them)")
|
||||
}
|
||||
groups := make([]restic.ForgetGroup, 0, len(p.ForgetGroups))
|
||||
@@ -594,13 +634,14 @@ func (d *dispatcher) runJob(ctx context.Context, p api.CommandRunPayload, tx wsc
|
||||
runCreds = ac
|
||||
}
|
||||
prr := runner.New(runner.Config{
|
||||
ResticBin: d.resticBin,
|
||||
ResticVersion: d.resticVer,
|
||||
RepoURL: runCreds.URL,
|
||||
RepoUsername: runCreds.Username,
|
||||
RepoPassword: runCreds.Password,
|
||||
LimitUploadKBps: upKBps,
|
||||
LimitDownloadKBps: downKBps,
|
||||
ResticBin: d.resticBin,
|
||||
ResticVersion: d.resticVer,
|
||||
RepoURL: runCreds.URL,
|
||||
RepoUsername: runCreds.Username,
|
||||
RepoPassword: runCreds.Password,
|
||||
SupportsRestoreNoOwnership: d.resticSupportsNoOwnership,
|
||||
LimitUploadKBps: upKBps,
|
||||
LimitDownloadKBps: downKBps,
|
||||
}, tx, time.Second)
|
||||
slog.Info("agent: accepting prune job", "job_id", p.JobID, "admin_creds", p.RequiresAdminCreds)
|
||||
spawn("prune", func(jobCtx context.Context) error {
|
||||
@@ -622,13 +663,16 @@ func (d *dispatcher) runJob(ctx context.Context, p api.CommandRunPayload, tx wsc
|
||||
})
|
||||
case api.JobRestore:
|
||||
if p.Restore == nil {
|
||||
failJob(p, tx, "restore: command.run carried no restore payload")
|
||||
return fmt.Errorf("restore: command.run carried no restore payload")
|
||||
}
|
||||
rp := *p.Restore
|
||||
if rp.SnapshotID == "" {
|
||||
failJob(p, tx, "restore: snapshot_id is required")
|
||||
return fmt.Errorf("restore: snapshot_id is required")
|
||||
}
|
||||
if !rp.InPlace && rp.TargetDir == "" {
|
||||
failJob(p, tx, "restore: target_dir required for non-in-place restore")
|
||||
return fmt.Errorf("restore: target_dir required for non-in-place restore")
|
||||
}
|
||||
slog.Info("agent: accepting restore job",
|
||||
@@ -639,6 +683,7 @@ func (d *dispatcher) runJob(ctx context.Context, p api.CommandRunPayload, tx wsc
|
||||
})
|
||||
case api.JobDiff:
|
||||
if p.Diff == nil || p.Diff.SnapshotA == "" || p.Diff.SnapshotB == "" {
|
||||
failJob(p, tx, "diff: command.run carried incomplete diff payload")
|
||||
return fmt.Errorf("diff: command.run carried incomplete diff payload")
|
||||
}
|
||||
dp := *p.Diff
|
||||
@@ -648,6 +693,7 @@ func (d *dispatcher) runJob(ctx context.Context, p api.CommandRunPayload, tx wsc
|
||||
return r.RunDiff(jobCtx, p.JobID, dp.SnapshotA, dp.SnapshotB)
|
||||
})
|
||||
default:
|
||||
failJob(p, tx, fmt.Sprintf("kind %q not implemented on this agent", p.Kind))
|
||||
return fmt.Errorf("kind %q not implemented yet (Phase 2 lands the rest)", p.Kind)
|
||||
}
|
||||
return nil
|
||||
|
||||
Reference in New Issue
Block a user