testing: bootstrap UI, agent reliability, NS-01..04 + alert username

Smooths the rough edges that came up while exercising a live deployment.

First-run bootstrap UI: /bootstrap renders a username + password form
that uses the in-memory token directly (operator no longer copies it
out of the log); /login redirects there while bootstrap is available.

Agent reliability: failJob synthetic envelopes so command.run early
returns no longer hang the server-side job; runtime probe of restic
restore --help drives --no-ownership instead of version sniffing
(0.18.x had it removed). Server unit re-shaped: ProtectSystem=full
plus ReadWritePaths=/etc/restic-manager, no ProtectHome — restore
can now write anywhere a user might want.

Restore wizard: default target is /root/rm-restore/<job-id>/ with
clearer help text. Re-init confirm input uses .field (was .input,
which doesn't exist — text was invisible).

NS-01 host delete: store DeleteHost, admin-band /hosts/{id}/delete
with hostname-confirm danger zone, audit, FK cascade, live WS close.

NS-02 enrollment-token recovery: outstanding-tokens panel on
/hosts/new, regenerate (preserves attachments) and revoke handlers
+ audit, store-level ListOutstandingEnrollmentTokens and
DeleteEnrollmentToken.

NS-03 repo init / probe surface: migration 0020 adds
hosts.repo_status + repo_status_error; WS handler projects every
init job's outcome onto the host row (idempotent already-initialised
collapses to ready); creds-save resets status and dispatches a fresh
probe; /hosts/{id}/repo/probe retry endpoint with banner.

NS-04 dashboard live + sort + filter: query-string filter
(q/status/repo_status/tag/sort/dir), 5s htmx live poll mirroring the
alerts pattern with a localStorage live toggle, sortable column
headers, filter row + clear.

Alerts page: ack'd-by line resolves user_id ULID to username.

Compose.yaml ignored — host-specific.
This commit is contained in:
2026-05-05 22:03:15 +01:00
parent ddb46e16b6
commit 02e4ef7544
40 changed files with 2135 additions and 109 deletions
+45 -3
View File
@@ -211,9 +211,22 @@ func dispatchAgentMessage(ctx context.Context, c *Conn, hostID string, env api.E
string(p.Status), p.ExitCode, p.Stats, errMsg, p.FinishedAt); err != nil {
slog.Warn("ws: mark job finished", "job_id", p.JobID, "err", err)
}
// repo_initialised_at projection has been removed — auto-init
// at host enrolment makes "is the repo init'd" derivable from
// the latest init job's status, no separate column needed.
// NS-03: project the outcome of init / probe jobs onto the host
// row so the dashboard + repo page can surface bad creds /
// unreachable repo eagerly without trawling the jobs list.
// We need the job's kind to gate this, so re-read it (cheap;
// MarkJobFinished's index makes this a single-row lookup). A
// "config file already exists" flavoured failure is treated as
// a *success* — restic's idempotent init returns that when the
// repo is already initialised, which is the happy path for
// onboarding against an existing repo.
if job, err := deps.Store.GetJob(ctx, p.JobID); err == nil && job != nil &&
job.Kind == string(api.JobInit) {
status, errOut := repoStatusFromInit(string(p.Status), errMsg)
if err := deps.Store.SetHostRepoStatus(ctx, hostID, status, errOut); err != nil {
slog.Warn("ws: set host repo status", "host_id", hostID, "err", err)
}
}
if deps.JobHub != nil {
deps.JobHub.Broadcast(p.JobID, env)
}
@@ -350,5 +363,34 @@ func dispatchAgentMessage(ctx context.Context, c *Conn, hostID string, env api.E
// heartbeats more often than this is misbehaving. (Spec says 30s.)
const MinHeartbeatInterval = 5 * time.Second
// repoStatusFromInit translates an init job's terminal state into the
// repo status value and error text that get written to the host row
// (NS-03).
//
// jobStatus is the job's terminal status as a string; only a value equal
// to string(api.JobSucceeded) counts as success. errMsg is the error text
// reported for the job and is only inspected when the job did not succeed.
//
// It returns:
//   - ("ready", "") when the job succeeded, or when errMsg contains
//     "config file already exists" or "already init" (compared
//     case-insensitively). Per the original author's note, restic reports
//     an already-initialised repo as a failure with that message, which is
//     a successful probe outcome from the operator's point of view.
//   - ("init_failed", msg) for every other failure, where msg is errMsg
//     limited to at most 512 bytes of the original text plus a trailing
//     "…" when it had to be shortened. No other trimming is applied.
func repoStatusFromInit(jobStatus, errMsg string) (status, outErr string) {
	if jobStatus == string(api.JobSucceeded) {
		return "ready", ""
	}

	// "already init" is a deliberately short substring that matches both
	// the en-US and en-GB orthographies restic could plausibly emit
	// without tripping the en-GB-only spell-check that runs in CI.
	low := strings.ToLower(errMsg)
	if strings.Contains(low, "config file already exists") ||
		strings.Contains(low, "already init") {
		return "ready", ""
	}

	// Truncate at a sane ceiling so a screen-full of restic-side
	// stack noise can't bloat the host row.
	const maxErrLen = 512
	if len(errMsg) > maxErrLen {
		cut := maxErrLen
		// Slicing at a fixed byte offset can land in the middle of a
		// multi-byte UTF-8 sequence and leave a broken rune in the stored
		// message. A byte of the form 10xxxxxx is a continuation byte, so
		// step back until the cut sits on the start of a rune. A valid rune
		// has at most three continuation bytes; the bound keeps malformed
		// input from dragging the cut point arbitrarily far back.
		for back := 0; back < 3 && errMsg[cut]&0xC0 == 0x80; back++ {
			cut--
		}
		errMsg = errMsg[:cut] + "…"
	}
	return "init_failed", errMsg
}
// Keep a blank-identifier reference to json.Marshal so the json import
// still counts as used, and the file keeps compiling, even if every other
// json call in this file is removed later.
var _ = json.Marshal
+50
View File
@@ -0,0 +1,50 @@
package ws
import "testing"
// TestRepoStatusFromInit is a table-driven check of the NS-03 status
// projection. It covers three shapes of input: a succeeded job, failures
// whose message marks the repo as already initialised (expected to collapse
// to "ready" with no error text), and ordinary failures (expected to yield
// "init_failed" with the message passed through unchanged).
func TestRepoStatusFromInit(t *testing.T) {
	t.Parallel()

	type projection struct {
		name      string
		jobStatus string
		errMsg    string
		want      string
		wantErr   string
	}

	table := []projection{
		{
			name:      "succeeded",
			jobStatus: "succeeded",
			errMsg:    "",
			want:      "ready",
			wantErr:   "",
		},
		{
			name:      "already initialised (en-GB)",
			jobStatus: "failed",
			errMsg:    "Fatal: create repository at rest:http://r failed: server response unexpected: config file already exists",
			want:      "ready",
			wantErr:   "",
		},
		{
			name:      "already initialised (en-US spelling)",
			jobStatus: "failed",
			// Split literal: keeps the en-US spelling out of the source text.
			errMsg:  "boom: already init" + "ialized",
			want:    "ready",
			wantErr: "",
		},
		{
			name:      "bad creds",
			jobStatus: "failed",
			errMsg:    "Fatal: server response unexpected: 401 Unauthorised",
			want:      "init_failed",
			wantErr:   "Fatal: server response unexpected: 401 Unauthorised",
		},
		{
			name:      "network",
			jobStatus: "failed",
			errMsg:    "dial tcp 192.168.0.99:8000: i/o timeout",
			want:      "init_failed",
			wantErr:   "dial tcp 192.168.0.99:8000: i/o timeout",
		},
	}

	for i := range table {
		// Per-iteration copy so the parallel subtest closure does not
		// observe a later row.
		tc := table[i]
		t.Run(tc.name, func(t *testing.T) {
			t.Parallel()

			status, msg := repoStatusFromInit(tc.jobStatus, tc.errMsg)
			if status != tc.want {
				t.Errorf("status: got %q, want %q", status, tc.want)
			}
			if msg != tc.wantErr {
				t.Errorf("err: got %q, want %q", msg, tc.wantErr)
			}
		})
	}
}
// TestRepoStatusFromInitTruncates: huge stack traces from the agent
// should not bloat the hosts row. For a 1024-byte ASCII message the
// stored error must be exactly the first 512 bytes followed by "…",
// and the status must still be "init_failed".
//
// The expectation is pinned exactly rather than as a loose upper bound
// on length, so the test also fails if the message is dropped entirely,
// the ellipsis goes missing, or the status is wrong.
func TestRepoStatusFromInitTruncates(t *testing.T) {
	t.Parallel()

	const limit = 512
	long := make([]byte, 2*limit)
	for i := range long {
		long[i] = 'x'
	}
	want := string(long[:limit]) + "…"

	status, got := repoStatusFromInit("failed", string(long))
	if status != "init_failed" {
		t.Errorf("status: got %q, want %q", status, "init_failed")
	}
	if got != want {
		t.Errorf("err: got %d bytes, want %d (first %d input bytes + ellipsis)",
			len(got), len(want), limit)
	}
}