From 44feb708bcf7f4f474b25c31fd76961d71ee580f Mon Sep 17 00:00:00 2001 From: Steve Cliff Date: Fri, 1 May 2026 14:01:59 +0100 Subject: [PATCH] fix: enrollment FK race + log-when-rejected; runbook fixes from dry-run MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The smoke runbook caught a real bug: ConsumeEnrollmentToken was inserting into host_credentials (FK -> hosts) inside the same tx as the token burn, but the host row didn't exist yet — CreateHost runs in the *next* statement. The agent saw a generic 401 with no clue why. Fix: drop the host_credentials insert from ConsumeEnrollmentToken; the HTTP handler now does Consume -> CreateHost -> SetHostCredentials. SetHostCredentials failure is logged loudly but doesn't fail the enrol — operator recovers via PUT /api/hosts/{id}/repo-credentials. Adds slog.Warn lines on both 401 paths in handleAgentEnroll so the underlying cause is visible in server logs (the wire response stays generic to avoid leaking which step failed). Test: TestEnrollmentTransfersRepoCreds rewritten to mirror the new order (consume -> create host -> SetHostCredentials). Runbook (docs/e2e-smoke.md): rest-server moved off 8000 (commonly in use); URLs use trailing slash on the rest path; clarified that secrets_key is minted on first agent start, not at enrol time. Co-Authored-By: Claude Opus 4.7 (1M context) --- docs/e2e-smoke.md | 21 ++++++---- internal/server/http/enrollment.go | 17 +++++++- internal/server/http/host_credentials_test.go | 10 +++-- internal/store/enrollment.go | 39 +++++-------------- internal/store/users_test.go | 4 +- 5 files changed, 47 insertions(+), 44 deletions(-) diff --git a/docs/e2e-smoke.md b/docs/e2e-smoke.md index af711ea..ef187d4 100644 --- a/docs/e2e-smoke.md +++ b/docs/e2e-smoke.md @@ -66,7 +66,9 @@ services: environment: - OPTIONS=--no-auth # smoke-test only; real deploys use --append-only + htpasswd ports: - - "127.0.0.1:8000:8000" + # Mapped to 8100 because most dev boxes already have something + # on 8000. Use any free port; just keep the URLs below in sync. + - "127.0.0.1:8100:8000" volumes: - ./rest:/data @@ -118,7 +120,7 @@ ENROLL=$(curl -s -b /tmp/rm-smoke/cookies -X POST http://127.0.0.1:8080/api/enro -H 'content-type: application/json' \ -d '{ "hostname":"smoke-host", - "repo_url":"rest:http://127.0.0.1:8000/smoke", + "repo_url":"rest:http://127.0.0.1:8100/smoke/", "repo_username":"", "repo_password":"smoke-pw" }') @@ -136,7 +138,7 @@ restic itself wants the repo initialised: ```sh RESTIC_PASSWORD=smoke-pw \ -restic -r rest:http://127.0.0.1:8000/smoke init +restic -r rest:http://127.0.0.1:8100/smoke/ init ``` ## 6. Pretend to be a fresh endpoint @@ -169,9 +171,11 @@ HOST_ID=$(grep host_id "$CONFIG" | awk '{print $2}' | tr -d '"') echo "host id: $HOST_ID" ``` -After enrolment, `agent.yaml` should contain `secrets_key:` (a -base64 32-byte key) and `host_id:` (a ULID). It should **not** -contain `repo_url:` or `repo_password:`. +After enrolment, `agent.yaml` should contain `host_id:` (a ULID), +`agent_token:`, and `server_url:`. It will **not** contain +`secrets_key:` yet — that's minted on the first non-enroll start +of the agent (next step). It should **not** contain `repo_url:` +or `repo_password:` (those never appear in plaintext on disk). ```sh cat "$CONFIG" @@ -196,7 +200,8 @@ ws agent: repo credentials updated via config.update That last line confirms slice 1 + 2 of P1-32/33: the server pushed the encrypted creds, the agent decrypted, persisted to `secrets.enc`, and is now ready to back up. `secrets.enc` should now exist and be -0600. +0600. `agent.yaml` should now also contain a freshly-minted +`secrets_key:` (base64-encoded 32 bytes). ```sh ls -l /tmp/rm-smoke/agent/var-lib/secrets.enc @@ -235,7 +240,7 @@ curl -s -b /tmp/rm-smoke/cookies \ "http://127.0.0.1:8080/api/hosts/$HOST_ID/repo-credentials" | jq ``` -Expect `{"repo_url":"rest:http://127.0.0.1:8000/smoke","has_password":true}`. +Expect `{"repo_url":"rest:http://127.0.0.1:8100/smoke/","has_password":true}`. The password is never returned over this endpoint. ## 11. Edit creds + verify push-on-update diff --git a/internal/server/http/enrollment.go b/internal/server/http/enrollment.go index 6755a6c..0998565 100644 --- a/internal/server/http/enrollment.go +++ b/internal/server/http/enrollment.go @@ -4,6 +4,7 @@ import ( "context" "encoding/json" "fmt" + "log/slog" stdhttp "net/http" "strings" "time" @@ -98,12 +99,14 @@ func (s *Server) handleAgentEnroll(w stdhttp.ResponseWriter, r *stdhttp.Request) // the token, which is about to disappear). encForHost, err := s.rebindTokenCreds(r.Context(), tokHash, hostID) if err != nil { + slog.Warn("enrollment: rebind token creds failed", "err", err) writeJSONError(w, stdhttp.StatusUnauthorized, "invalid_token", "token unknown, expired, or already used") return } - if err := s.deps.Store.ConsumeEnrollmentToken(r.Context(), tokHash, hostID, encForHost); err != nil { + if err := s.deps.Store.ConsumeEnrollmentToken(r.Context(), tokHash, hostID); err != nil { + slog.Warn("enrollment: consume token failed", "err", err) writeJSONError(w, stdhttp.StatusUnauthorized, "invalid_token", "token unknown, expired, or already used") return @@ -131,6 +134,18 @@ func (s *Server) handleAgentEnroll(w stdhttp.ResponseWriter, r *stdhttp.Request) return } + // Promote the encrypted repo creds onto the freshly-created host + // row. If this fails for any reason we log loudly but still + // return the bearer — the operator recovers via PUT + // /api/hosts/{id}/repo-credentials. Failing the whole enrolment + // here would leave a half-burned token + an orphan host. + if encForHost != "" { + if err := s.deps.Store.SetHostCredentials(r.Context(), hostID, encForHost); err != nil { + slog.Error("enrollment: set host credentials failed", + "host_id", hostID, "err", err) + } + } + _ = s.deps.Store.AppendAudit(r.Context(), store.AuditEntry{ ID: ulid.Make().String(), Actor: "system", diff --git a/internal/server/http/host_credentials_test.go b/internal/server/http/host_credentials_test.go index f435a0e..750dc69 100644 --- a/internal/server/http/host_credentials_test.go +++ b/internal/server/http/host_credentials_test.go @@ -46,14 +46,18 @@ func TestEnrollmentTransfersRepoCreds(t *testing.T) { t.Errorf("rebind should change ciphertext (additional-data differs); got identical") } - // Need a host row for the FK. + // Burn the token, then create the host row, then promote — same + // order the HTTP handler runs. + if err := st.ConsumeEnrollmentToken(ctx, tokHash, hostID); err != nil { + t.Fatalf("consume: %v", err) + } if _, err := st.DB().Exec( `INSERT INTO hosts (id, name, os, arch, enrolled_at) VALUES (?,?,?,?,?)`, hostID, "host42", "linux", "amd64", "2026-01-01T00:00:00Z"); err != nil { t.Fatalf("insert host: %v", err) } - if err := st.ConsumeEnrollmentToken(ctx, tokHash, hostID, encForHost); err != nil { - t.Fatalf("consume: %v", err) + if err := st.SetHostCredentials(ctx, hostID, encForHost); err != nil { + t.Fatalf("set host credentials: %v", err) } // host_credentials row should now hold the host-bound ciphertext. diff --git a/internal/store/enrollment.go b/internal/store/enrollment.go index edf8383..e2f6843 100644 --- a/internal/store/enrollment.go +++ b/internal/store/enrollment.go @@ -37,22 +37,17 @@ func (s *Store) CreateEnrollmentToken(ctx context.Context, tokenHash string, ttl } // ConsumeEnrollmentToken atomically validates a token (must exist, -// not be consumed, not be expired), marks it consumed by hostID, and -// — if the token carries encrypted repo creds — promotes them to a -// host_credentials row in the same tx. The encrypted blob is -// re-encrypted by the caller with host_id as additional data; we -// don't crack it open here. -// +// not be consumed, not be expired) and marks it consumed by hostID. // Returns ErrNotFound on any failure. -func (s *Store) ConsumeEnrollmentToken(ctx context.Context, tokenHash, hostID, encRepoCredsForHost string) error { +// +// The associated repo creds (if any) are promoted into +// host_credentials by the caller via SetHostCredentials *after* the +// host row exists — host_credentials has a FK to hosts that would +// otherwise fire here, since the host is created by a separate +// statement immediately after this returns. +func (s *Store) ConsumeEnrollmentToken(ctx context.Context, tokenHash, hostID string) error { now := time.Now().UTC().Format(time.RFC3339Nano) - tx, err := s.db.BeginTx(ctx, nil) - if err != nil { - return fmt.Errorf("store: consume enrollment token: begin: %w", err) - } - defer func() { _ = tx.Rollback() }() - - res, err := tx.ExecContext(ctx, + res, err := s.db.ExecContext(ctx, `UPDATE enrollment_tokens SET consumed_at = ?, consumed_host = ? WHERE token_hash = ? AND consumed_at IS NULL AND expires_at > ?`, @@ -64,22 +59,6 @@ func (s *Store) ConsumeEnrollmentToken(ctx context.Context, tokenHash, hostID, e if n == 0 { return ErrNotFound } - - if encRepoCredsForHost != "" { - if _, err := tx.ExecContext(ctx, - `INSERT INTO host_credentials (host_id, enc_repo_creds, updated_at) - VALUES (?, ?, ?) - ON CONFLICT(host_id) DO UPDATE SET - enc_repo_creds = excluded.enc_repo_creds, - updated_at = excluded.updated_at`, - hostID, encRepoCredsForHost, now); err != nil { - return fmt.Errorf("store: promote host credentials: %w", err) - } - } - - if err := tx.Commit(); err != nil { - return fmt.Errorf("store: consume enrollment token: commit: %w", err) - } return nil } diff --git a/internal/store/users_test.go b/internal/store/users_test.go index 35a1aa0..c065aee 100644 --- a/internal/store/users_test.go +++ b/internal/store/users_test.go @@ -148,11 +148,11 @@ func TestEnrollmentTokenSingleUse(t *testing.T) { t.Fatalf("insert host: %v", err) } - if err := s.ConsumeEnrollmentToken(ctx, hash, "h1", ""); err != nil { + if err := s.ConsumeEnrollmentToken(ctx, hash, "h1"); err != nil { t.Fatalf("consume: %v", err) } // Second consume must fail — the whole point of one-time tokens. - if err := s.ConsumeEnrollmentToken(ctx, hash, "h1", ""); !errors.Is(err, ErrNotFound) { + if err := s.ConsumeEnrollmentToken(ctx, hash, "h1"); !errors.Is(err, ErrNotFound) { t.Errorf("re-consume: want ErrNotFound, got %v", err) } }