fix: enrollment FK ordering bug + log-when-rejected; runbook fixes from dry-run

The smoke runbook caught a real bug: ConsumeEnrollmentToken was
inserting into host_credentials (FK -> hosts) inside the same tx as
the token burn, but the host row didn't exist yet — CreateHost
runs in the *next* statement. The consume call therefore returned an
error, the handler mapped it to its generic 401, and the agent had
no clue why.

Fix: drop the host_credentials insert from ConsumeEnrollmentToken;
the HTTP handler now does Consume -> CreateHost ->
SetHostCredentials. SetHostCredentials failure is logged loudly
but doesn't fail the enrol — operator recovers via PUT
/api/hosts/{id}/repo-credentials.

Adds slog.Warn lines on both 401 paths in handleAgentEnroll so the
underlying cause is visible in server logs (the wire response stays
generic to avoid leaking which step failed).

Test: TestEnrollmentTransfersRepoCreds rewritten to mirror the new
order (consume -> create host -> SetHostCredentials).

Runbook (docs/e2e-smoke.md): rest-server moved off port 8000
(commonly in use); URLs now include a trailing slash on the rest
path; clarified that secrets_key is minted on first agent start,
not at enrol time.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-01 14:01:59 +01:00
parent 6cfbdfc7ab
commit 44feb708bc
5 changed files with 47 additions and 44 deletions
+16 -1
View File
@@ -4,6 +4,7 @@ import (
"context"
"encoding/json"
"fmt"
"log/slog"
stdhttp "net/http"
"strings"
"time"
@@ -98,12 +99,14 @@ func (s *Server) handleAgentEnroll(w stdhttp.ResponseWriter, r *stdhttp.Request)
// the token, which is about to disappear).
encForHost, err := s.rebindTokenCreds(r.Context(), tokHash, hostID)
if err != nil {
slog.Warn("enrollment: rebind token creds failed", "err", err)
writeJSONError(w, stdhttp.StatusUnauthorized, "invalid_token",
"token unknown, expired, or already used")
return
}
if err := s.deps.Store.ConsumeEnrollmentToken(r.Context(), tokHash, hostID, encForHost); err != nil {
if err := s.deps.Store.ConsumeEnrollmentToken(r.Context(), tokHash, hostID); err != nil {
slog.Warn("enrollment: consume token failed", "err", err)
writeJSONError(w, stdhttp.StatusUnauthorized, "invalid_token",
"token unknown, expired, or already used")
return
@@ -131,6 +134,18 @@ func (s *Server) handleAgentEnroll(w stdhttp.ResponseWriter, r *stdhttp.Request)
return
}
// Promote the encrypted repo creds onto the freshly-created host
// row. If this fails for any reason we log loudly but still
// return the bearer — the operator recovers via PUT
// /api/hosts/{id}/repo-credentials. Failing the whole enrolment
// here would leave a half-burned token + an orphan host.
if encForHost != "" {
if err := s.deps.Store.SetHostCredentials(r.Context(), hostID, encForHost); err != nil {
slog.Error("enrollment: set host credentials failed",
"host_id", hostID, "err", err)
}
}
_ = s.deps.Store.AppendAudit(r.Context(), store.AuditEntry{
ID: ulid.Make().String(),
Actor: "system",
@@ -46,14 +46,18 @@ func TestEnrollmentTransfersRepoCreds(t *testing.T) {
t.Errorf("rebind should change ciphertext (additional-data differs); got identical")
}
// Need a host row for the FK.
// Burn the token, then create the host row, then promote — same
// order the HTTP handler runs.
if err := st.ConsumeEnrollmentToken(ctx, tokHash, hostID); err != nil {
t.Fatalf("consume: %v", err)
}
if _, err := st.DB().Exec(
`INSERT INTO hosts (id, name, os, arch, enrolled_at) VALUES (?,?,?,?,?)`,
hostID, "host42", "linux", "amd64", "2026-01-01T00:00:00Z"); err != nil {
t.Fatalf("insert host: %v", err)
}
if err := st.ConsumeEnrollmentToken(ctx, tokHash, hostID, encForHost); err != nil {
t.Fatalf("consume: %v", err)
if err := st.SetHostCredentials(ctx, hostID, encForHost); err != nil {
t.Fatalf("set host credentials: %v", err)
}
// host_credentials row should now hold the host-bound ciphertext.