testing: bootstrap UI, agent reliability, NS-01..04 + alert username

Smooths the rough edges that came up exercising a live deployment. First-run bootstrap UI: /bootstrap renders a username + password form that uses the in-memory token directly (operator no longer copies it out of the log); /login redirects there while bootstrap is available. Agent reliability: failJob synthetic envelopes so command.run early returns no longer hang the server-side job; runtime probe of restic restore --help drives --no-ownership instead of version sniffing (0.18.x had it removed). Server unit re-shaped: ProtectSystem=full plus ReadWritePaths=/etc/restic-manager, no ProtectHome — restore can now write anywhere a user might want. Restore wizard: default target is /root/rm-restore/<job-id>/ with clearer help text. Re-init confirm input uses .field (was .input, which doesn't exist — text was invisible). NS-01 host delete: store DeleteHost, admin-band /hosts/{id}/delete with hostname-confirm danger zone, audit, FK cascade, live WS close. NS-02 enrollment-token recovery: outstanding-tokens panel on /hosts/new, regenerate (preserves attachments) and revoke handlers + audit, store-level ListOutstandingEnrollmentTokens and DeleteEnrollmentToken. NS-03 repo init / probe surface: migration 0020 adds hosts.repo_status + repo_status_error; WS handler projects every init job's outcome onto the host row (idempotent already-initialised collapses to ready); creds-save resets status and dispatches a fresh probe; /hosts/{id}/repo/probe retry endpoint with banner. NS-04 dashboard live + sort + filter: query-string filter (q/status/repo_status/tag/sort/dir), 5s htmx live poll mirroring the alerts pattern with a localStorage live toggle, sortable column headers, filter row + clear. Alerts page: ack'd-by line resolves user_id ULID to username. Compose.yaml ignored — host-specific.
2026-05-05 22:03:15 +01:00
parent ddb46e16b6
commit 02e4ef7544
40 changed files with 2135 additions and 109 deletions
@@ -160,6 +160,78 @@ func (s *Store) GetEnrollmentTokenStatus(ctx context.Context, tokenHash string)
 	return out, nil
 }

+// OutstandingEnrollmentToken is one row of the recoverable-token list
+// on the Add-host page: enough to identify the token (hash plus its
+// created/expires times) and to re-render the install snippet via the
+// regenerate flow, plus the still-encrypted repo creds blob, which the
+// caller can decrypt and redact for display.
+type OutstandingEnrollmentToken struct {
+	TokenHash    string    // token_hash column; the key DeleteEnrollmentToken takes
+	CreatedAt    time.Time // parsed from created_at (RFC3339Nano); zero if unparseable
+	ExpiresAt    time.Time // parsed from expires_at (RFC3339Nano); zero if unparseable
+	EncRepoCreds string    // enc_repo_creds as stored; "" when the column is NULL
+	InitialPaths []string  // decoded from the initial_paths JSON column
+}
+
+// ListOutstandingEnrollmentTokens returns every still-valid token
+// (un-consumed and not expired), newest first. Used by the Add-host
+// page to give operators a way back to the install snippet after they
+// close the /hosts/pending/{token} tab without finishing onboarding.
+//
+// On any query, scan or iteration error the returned slice is nil; a
+// partial listing is never handed back alongside an error. Per-row
+// decoding is deliberately best-effort: an unparseable timestamp
+// leaves the corresponding time field zero, and missing or unusable
+// initial_paths JSON leaves InitialPaths as an empty, non-nil slice
+// rather than failing the whole page.
+func (s *Store) ListOutstandingEnrollmentTokens(ctx context.Context) ([]OutstandingEnrollmentToken, error) {
+	// expires_at is compared as text against "now" rendered in
+	// RFC3339Nano, the same layout the rows are parsed with below.
+	now := time.Now().UTC().Format(time.RFC3339Nano)
+	rows, err := s.db.QueryContext(ctx,
+		`SELECT token_hash, created_at, expires_at, enc_repo_creds, initial_paths
+		 FROM enrollment_tokens
+		 WHERE consumed_at IS NULL AND expires_at > ?
+		 ORDER BY created_at DESC`, now)
+	if err != nil {
+		return nil, fmt.Errorf("store: list outstanding enrollment tokens: %w", err)
+	}
+	// Close error is irrelevant for a read-only cursor; iteration
+	// failures are reported through rows.Err() below.
+	defer func() { _ = rows.Close() }()
+
+	var out []OutstandingEnrollmentToken
+	for rows.Next() {
+		var (
+			hash, created, expires string
+			// NullString so a NULL in either column scans cleanly
+			// instead of aborting the whole listing.
+			enc, pathsJSON         sql.NullString
+		)
+		if err := rows.Scan(&hash, &created, &expires, &enc, &pathsJSON); err != nil {
+			return nil, fmt.Errorf("store: scan outstanding enrollment token: %w", err)
+		}
+		row := OutstandingEnrollmentToken{TokenHash: hash, InitialPaths: []string{}}
+		// Best-effort: a malformed timestamp keeps the zero time.
+		if t, err := time.Parse(time.RFC3339Nano, created); err == nil {
+			row.CreatedAt = t
+		}
+		if t, err := time.Parse(time.RFC3339Nano, expires); err == nil {
+			row.ExpiresAt = t
+		}
+		if enc.Valid {
+			row.EncRepoCreds = enc.String
+		}
+		if pathsJSON.Valid && pathsJSON.String != "" {
+			// Decode into a scratch slice so a failed or `null`
+			// decode cannot leave InitialPaths half-filled or nil.
+			var paths []string
+			if err := json.Unmarshal([]byte(pathsJSON.String), &paths); err == nil && paths != nil {
+				row.InitialPaths = paths
+			}
+		}
+		out = append(out, row)
+	}
+	if err := rows.Err(); err != nil {
+		return nil, fmt.Errorf("store: iterate outstanding enrollment tokens: %w", err)
+	}
+	return out, nil
+}
+
+// DeleteEnrollmentToken removes a token row by its hash. Used by the
+// operator-driven revoke flow and by regenerate (which deletes the old
+// hash then mints a fresh one).
+//
+// Returns ErrNotFound when no row matched, so a second delete of the
+// same hash reports the miss rather than silently succeeding. A
+// failure to read the affected-row count is returned as its own
+// wrapped error instead of being mistaken for "not found".
+func (s *Store) DeleteEnrollmentToken(ctx context.Context, tokenHash string) error {
+	res, err := s.db.ExecContext(ctx,
+		`DELETE FROM enrollment_tokens WHERE token_hash = ?`, tokenHash)
+	if err != nil {
+		return fmt.Errorf("store: delete enrollment token: %w", err)
+	}
+	n, err := res.RowsAffected()
+	if err != nil {
+		return fmt.Errorf("store: delete enrollment token rows: %w", err)
+	}
+	if n == 0 {
+		return ErrNotFound
+	}
+	return nil
+}
+
 // PurgeExpiredEnrollmentTokens deletes long-expired token rows. Tokens
 // retained for ~24h after expiry so audit traces still resolve them.
 func (s *Store) PurgeExpiredEnrollmentTokens(ctx context.Context) (int64, error) {
@@ -43,7 +43,8 @@ func (s *Store) LookupHostByAgentToken(ctx context.Context, tokenHash string) (*
 			current_job_id, last_backup_at, last_backup_status,
 			repo_size_bytes, snapshot_count, open_alert_count,
 			applied_schedule_version, bandwidth_up_kbps, bandwidth_down_kbps,
-			pre_hook_default, post_hook_default
+			pre_hook_default, post_hook_default,
+			repo_status, repo_status_error
 		 FROM hosts WHERE agent_token_hash = ?`,
 		tokenHash)
 	return scanHost(row)
@@ -57,11 +58,55 @@ func (s *Store) GetHost(ctx context.Context, id string) (*Host, error) {
 			current_job_id, last_backup_at, last_backup_status,
 			repo_size_bytes, snapshot_count, open_alert_count,
 			applied_schedule_version, bandwidth_up_kbps, bandwidth_down_kbps,
-			pre_hook_default, post_hook_default
+			pre_hook_default, post_hook_default,
+			repo_status, repo_status_error
 		 FROM hosts WHERE id = ?`, id)
 	return scanHost(row)
 }

+// SetHostRepoStatus persists the outcome of the latest init / probe
+// attempt against this host's repo. Called by the WS handler on every
+// job.finished of kind=init, and reset to ("unknown", "") by
+// repo-credentials saves so the next probe reflects the new creds.
+//
+// errMsg is stored verbatim (truncate at the call site if you care
+// about row size). Empty for "ready".
+func (s *Store) SetHostRepoStatus(ctx context.Context, hostID, status, errMsg string) error {
+	_, err := s.db.ExecContext(ctx,
+		`UPDATE hosts SET repo_status = ?, repo_status_error = ? WHERE id = ?`,
+		status, errMsg, hostID)
+	if err != nil {
+		return fmt.Errorf("store: set host repo status: %w", err)
+	}
+	return nil
+}
+
+// DeleteHost removes a host row by id. Returns ErrNotFound if no row
+// matched. Foreign-key cascades (declared on every dependent table —
+// schedules, jobs, snapshots, source_groups, host_credentials, etc.)
+// remove the rest. The connection DSN already pins
+// PRAGMA foreign_keys=ON, so the cascade is honoured here without an
+// explicit pragma roundtrip.
+//
+// The host's agent bearer is stored in agent_token_hash on this row,
+// so deleting the row also revokes the agent — a re-installed
+// instance must come back through the normal pending-host accept
+// flow.
+func (s *Store) DeleteHost(ctx context.Context, id string) error {
+	res, err := s.db.ExecContext(ctx, `DELETE FROM hosts WHERE id = ?`, id)
+	if err != nil {
+		return fmt.Errorf("store: delete host: %w", err)
+	}
+	n, err := res.RowsAffected()
+	if err != nil {
+		return fmt.Errorf("store: delete host rows: %w", err)
+	}
+	if n == 0 {
+		return ErrNotFound
+	}
+	return nil
+}
+
 // MarkHostHello updates the host row with metadata received in the
 // agent's hello message and flips status to 'online'.
 func (s *Store) MarkHostHello(ctx context.Context, id string, agentVersion, resticVersion string, protoVersion int, when time.Time) error {
@@ -168,7 +213,8 @@ func (s *Store) ListHosts(ctx context.Context) ([]Host, error) {
 			current_job_id, last_backup_at, last_backup_status,
 			repo_size_bytes, snapshot_count, open_alert_count,
 			applied_schedule_version, bandwidth_up_kbps, bandwidth_down_kbps,
-			pre_hook_default, post_hook_default
+			pre_hook_default, post_hook_default,
+			repo_status, repo_status_error
 		 FROM hosts ORDER BY name`)
 	if err != nil {
 		return nil, fmt.Errorf("store: list hosts: %w", err)
@@ -215,7 +261,8 @@ func scanHostRow(s hostScanner) (*Host, error) {
 		&currentJob, &lastBackupAt, &lastBkSt,
 		&h.RepoSizeBytes, &h.SnapshotCount, &h.OpenAlertCount,
 		&h.AppliedScheduleVersion, &bwUp, &bwDown,
-		&preHook, &postHook)
+		&preHook, &postHook,
+		&h.RepoStatus, &h.RepoStatusError)
 	if err != nil {
 		if errors.Is(err, sql.ErrNoRows) {
 			return nil, ErrNotFound
@@ -0,0 +1,98 @@
+package store
+
+import (
+	"context"
+	"errors"
+	"testing"
+	"time"
+)
+
+// TestDeleteHostCascades checks that DeleteHost removes the host row
+// and that the FK cascade from the migrations empties the dependent
+// tables (schedules, jobs, source groups, host_credentials and the
+// schedule/source-group join). It also checks the agent bearer stops
+// resolving, since a re-installed agent must re-enter through
+// pending-host accept.
+func TestDeleteHostCascades(t *testing.T) {
+	t.Parallel()
+
+	const (
+		schedID = "01HDELSCHED00000000000001"
+		// Assumes makeSchedHost enrols the host with this bearer
+		// hash; the pre-delete lookup below fails loudly if not.
+		agentHash = "tokenhash"
+	)
+
+	ctx := context.Background()
+	s := openTestStore(t)
+	hostID := makeSchedHost(t, s)
+	gid := makeGroup(t, s, hostID, "default", "01HDELGRP000000000000001")
+
+	// Seed one row per dependent table: a job, a schedule (which also
+	// creates the join row to the source group) and a credential.
+	job := Job{
+		ID:        "j-del-1",
+		HostID:    hostID,
+		Kind:      "backup",
+		ActorKind: "system",
+		CreatedAt: time.Now().UTC(),
+	}
+	if err := s.CreateJob(ctx, job); err != nil {
+		t.Fatalf("create job: %v", err)
+	}
+	if err := s.CreateSchedule(ctx, &Schedule{
+		ID:             schedID,
+		HostID:         hostID,
+		CronExpr:       "0 3 * * *",
+		Enabled:        true,
+		SourceGroupIDs: []string{gid},
+	}); err != nil {
+		t.Fatalf("create schedule: %v", err)
+	}
+	if err := s.SetHostCredentials(ctx, hostID, CredKindRepo, "ciphertext"); err != nil {
+		t.Fatalf("set creds: %v", err)
+	}
+
+	// Precondition: the bearer must resolve before we delete.
+	if _, err := s.LookupHostByAgentToken(ctx, agentHash); err != nil {
+		t.Fatalf("pre-delete bearer lookup: %v", err)
+	}
+
+	if err := s.DeleteHost(ctx, hostID); err != nil {
+		t.Fatalf("DeleteHost: %v", err)
+	}
+
+	_, getErr := s.GetHost(ctx, hostID)
+	if !errors.Is(getErr, ErrNotFound) {
+		t.Errorf("GetHost after delete: want ErrNotFound, got %v", getErr)
+	}
+	_, lookupErr := s.LookupHostByAgentToken(ctx, agentHash)
+	if !errors.Is(lookupErr, ErrNotFound) {
+		t.Errorf("bearer lookup after delete: want ErrNotFound, got %v", lookupErr)
+	}
+
+	// No public per-host listing exists for these tables, so count
+	// leftovers straight from the store's own DB handle. Each entry
+	// carries the key its WHERE clause expects.
+	checks := []struct {
+		label, query, key string
+	}{
+		{"schedules", "SELECT count(*) FROM schedules WHERE host_id = ?", hostID},
+		{"jobs", "SELECT count(*) FROM jobs WHERE host_id = ?", hostID},
+		{"source_groups", "SELECT count(*) FROM source_groups WHERE host_id = ?", hostID},
+		{"host_credentials", "SELECT count(*) FROM host_credentials WHERE host_id = ?", hostID},
+		{"schedule_source_groups", "SELECT count(*) FROM schedule_source_groups WHERE schedule_id = ?", schedID},
+	}
+	for _, c := range checks {
+		var remaining int
+		if err := s.db.QueryRowContext(ctx, c.query, c.key).Scan(&remaining); err != nil {
+			t.Fatalf("count %s: %v", c.label, err)
+		}
+		if remaining != 0 {
+			t.Errorf("cascade left %d rows in %s", remaining, c.label)
+		}
+	}
+}
+
+// TestDeleteHostNotFound: a delete against a missing id surfaces
+// ErrNotFound so the HTTP layer can 404 instead of 200-ing a no-op.
+func TestDeleteHostNotFound(t *testing.T) {
+	t.Parallel()
+	s := openTestStore(t)
+	if err := s.DeleteHost(context.Background(), "01HNOTAHOST00000000000000"); !errors.Is(err, ErrNotFound) {
+		t.Errorf("missing id: want ErrNotFound, got %v", err)
+	}
+}
@@ -0,0 +1,22 @@
+-- 0020_hosts_repo_status.sql
+--
+-- NS-03: surface repo init / probe state on the host row so the
+-- operator sees credential / connectivity failures eagerly rather
+-- than discovering them via a missed scheduled backup.
+--
+-- repo_status (the CHECK below admits exactly these three values):
+--   'unknown'      — no probe outcome yet (default for fresh enrollment
+--                    and for hosts re-binding fresh creds).
+--   'ready'        — last init / probe succeeded; repo is reachable
+--                    with the bound creds.
+--   'init_failed'  — last init / probe failed; repo_status_error has
+--                    the trimmed agent-side error message.
+--
+-- repo_status_error:
+--   free text, '' by default; only meaningful alongside 'init_failed'.
+--
+-- The init-pending intermediate state is intentionally omitted: a job
+-- in flight is already visible on the host detail page via
+-- jobs.status, and bridging both surfaces leads to drift. The host
+-- column reflects the *outcome* of the last probe.
+--
+-- Both columns are NOT NULL with a DEFAULT, so rows that already exist
+-- take on 'unknown' / '' when the columns are added.
+
+ALTER TABLE hosts ADD COLUMN repo_status TEXT NOT NULL DEFAULT 'unknown'
+  CHECK (repo_status IN ('unknown', 'ready', 'init_failed'));
+ALTER TABLE hosts ADD COLUMN repo_status_error TEXT NOT NULL DEFAULT '';
@@ -90,6 +90,15 @@ type Host struct {
 	// Empty = no default configured.
 	PreHookDefault  string
 	PostHookDefault string
+
+	// RepoStatus tracks the outcome of the last init/probe attempt:
+	// "unknown" (default), "ready", or "init_failed". Set by the WS
+	// handler on every job.finished of kind=init, and reset to
+	// "unknown" by repo-credentials saves so the next dispatch
+	// re-tests the new creds. RepoStatusError carries the trimmed
+	// agent-side message when RepoStatus == "init_failed".
+	RepoStatus      string
+	RepoStatusError string
 }

 // Schedule is now intentionally slim: cron + which groups + enabled.