Merge pull request 'De-flake TestDrainPendingSerializesPerHost (CI stability)' (#33) from fix-flaky-server-http-tests into main

Reviewed-on: #33
test(pending-drain): de-flake TestDrainPendingSerializesPerHost
2026-06-16 15:44:47 +01:00 · 2026-06-16 13:29:47 +01:00 · 2026-06-16 07:32:00 +01:00 · 2026-06-15 23:07:43 +01:00 · 2026-06-15 23:01:03 +01:00 · 2026-06-15 23:00:56 +01:00
4 changed files with 85 additions and 19 deletions
@@ -49,3 +49,6 @@ coverage.html
 # Local-only planning / scratch — never committed.
 /ask.md
 /docs/superpowers/
 # Claude Code agent worktrees (transient, harness-created).
 /.claude/worktrees/
@@ -6,6 +6,44 @@ and the project follows [Semantic Versioning](https://semver.org/).
 ## [Unreleased]
 ## [1.1.0] - 2026-06-15
 ### Added
 - **Always-On vs intermittent host mode.** A host can now be marked as
  not always-on — for laptops/workstations that legitimately sleep,
  travel, or shut down outside hours. An intermittent host no longer
  raises "agent offline" alerts when it disappears; instead it shows a
  calm "asleep" state in the UI ("asleep · last seen … · will catch up
  on return") and is covered by a longer-horizon staleness alert (raised
  only when it has an enabled schedule and no successful backup in 7
  days). When such a host reconnects, the server waits a short settle
  window and then automatically dispatches any scheduled backup whose
  window elapsed while it was asleep. Toggle per host from the host
  detail page (operator-band, audited as `host.mode_updated`). New and
  existing hosts default to always-on, so current fleets are unaffected.
 ### Changed
 - Host-detail header redesign: tags and presence are grouped into
  labelled, boxed pills with click-to-edit; presence shows a `24x7` /
  `Free` chip; the agent "out of date" indicator is simplified (the full
  version detail remains in the Agent-update panel and on hover).
 - Relative timestamps ("2h ago") now tick client-side, so a tab left
  open no longer shows a stale value as wall-clock time moves on.
 - Release and CI container images are now published to and pulled from
  the zot OCI registry (`docker.dcglab.co.uk`).
 ## [1.0.1] - 2026-05-09
 ### Fixed
 - Build version is now single-sourced from `internal/version`, and the
  server Dockerfile's ldflags were corrected so docker-built binaries
  report their real version. Previously `internal/version.Version` stayed
  at its "dev" default in docker images, which made every host look
  permanently out-of-date to the update logic.
 ## [1.0.0] - 2026-05-09
 First tagged release. Six development phases brought the project from
@@ -512,11 +512,27 @@ func TestDrainPendingSerializesPerHost(t *testing.T) {
 	// Connect the agent so DrainPending can dispatch.
 	c := agentDial(t, srv, ts, hostID, token)
 	sendHello(t, c, "serialise-host")
-	// Drain the on-hello goroutine's pass first (no pending rows yet),
+	// Wait for the on-hello push to settle.
 	// then wait for the schedule.set so the connection is fully settled.
 	_ = drainUntil(t, c, api.MsgScheduleSet)
-	// Insert 5 pending rows now that the on-hello drain has already run.
+	// A real agent is always in a read loop. Keep this test client
 	// reading in the background for the rest of the test: without an
 	// active reader the server-side conn can be dropped under parallel
 	// load, which unregisters it from the hub and makes DrainPending
 	// no-op (conn == nil) — the historical source of this test's
 	// flakiness (it would observe 0 or a partial drain). The reader also
 	// consumes the command.run envelopes our drains emit.
 	readerCtx, stopReader := context.WithCancel(context.Background())
 	defer stopReader()
 	go func() {
 		for {
 			if _, _, err := c.Read(readerCtx); err != nil {
 				return
 			}
 		}
 	}()
 	// Insert 5 due pending rows.
 	now := time.Now().UTC()
 	for i := range 5 {
 		pid := ulid.Make().String()
@@ -533,7 +549,8 @@ func TestDrainPendingSerializesPerHost(t *testing.T) {
 		}
 	}
-	// Spawn 10 goroutines all calling DrainPending concurrently.
+	// Fire 10 concurrent DrainPending calls. The per-host mutex must
 	// ensure each row is dispatched at most once (no double-dispatch).
 	var wg sync.WaitGroup
 	for range 10 {
 		wg.Add(1)
@@ -544,24 +561,26 @@ func TestDrainPendingSerializesPerHost(t *testing.T) {
 	}
 	wg.Wait()
-	// Drain any envelopes the agent received so we don't block below.
+	// Drain to completion. The fire-and-forget on-hello DrainPending
-	// We read with short timeouts and stop when the connection goes quiet.
+	// shares the same per-host mutex and can hold it during the burst,
-	drainDeadline := time.Now().Add(500 * time.Millisecond)
+	// leaving rows for a later pass — exactly how production drains
-	for time.Now().Before(drainDeadline) {
+	// (repeatedly, via the 30s tick / on reconnect). Re-drain until the
-		ctx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond)
+	// queue is empty; because every drain is still serialised, each row
-		_, _, err := c.Read(ctx)
+	// is dispatched at most once, so the exactly-5 job count below proves
-		cancel()
+	// there was no double-dispatch.
-		if err != nil {
+	deadline := time.Now().Add(5 * time.Second)
-			break
+	for countPendingForHost(t, st, hostID) > 0 && time.Now().Before(deadline) {
-		}
+		srv.DrainPending(context.Background(), hostID)
 		time.Sleep(10 * time.Millisecond)
 	}
-	// All 5 pending rows must be gone.
+	// All 5 pending rows must be drained.
 	if n := countPendingForHost(t, st, hostID); n != 0 {
-		t.Errorf("pending rows after concurrent drain: got %d, want 0", n)
+		t.Errorf("pending rows after drain-to-completion: got %d, want 0", n)
 	}
-	// Exactly 5 backup job rows (one per pending row), not 10+ from a race.
+	// Exactly 5 backup job rows (one per pending row) — never more, which
 	// would mean the per-host mutex failed to prevent double-dispatch.
 	var n int
 	_ = st.DB().QueryRow(
 		`SELECT COUNT(*) FROM jobs WHERE host_id = ? AND kind = 'backup' AND actor_kind = 'schedule'`,
@@ -49,8 +49,14 @@ func TestDashboard_HostRowSparklineRendersWithHistory(t *testing.T) {
 	hostID := makeHost(t, st, "h-spark")
 	ctx := context.Background()
-	// Two history points → polyline must render.
+	// Two history points → polyline must render. Use dates relative to
-	for i, day := range []string{"2026-05-05", "2026-05-06"} {
+	// now so the points always fall inside the dashboard's rolling
 	// 30-day window (ui_handlers.go: since = now-30d); hard-coded dates
 	// silently age out of the window and break this test over time.
 	for i, day := range []string{
 		time.Now().UTC().AddDate(0, 0, -2).Format("2006-01-02"),
 		time.Now().UTC().AddDate(0, 0, -1).Format("2006-01-02"),
 	} {
 		v := int64(100 + i*50)
 		if err := st.UpsertHostRepoStatsHistory(ctx, hostID, day,
 			store.HostRepoStats{TotalSizeBytes: &v}, time.Now().UTC()); err != nil {
Author	SHA1	Message	Date
steve	6c6b962e24	Merge pull request 'De-flake TestDrainPendingSerializesPerHost (CI stability)' (#33) from fix-flaky-server-http-tests into main Reviewed-on: #33	2026-06-16 15:44:47 +01:00
steve	e64075d5d7	test(pending-drain): de-flake TestDrainPendingSerializesPerHost CI / Test (store) (pull_request) Successful in 8s Details CI / Test (rest) (pull_request) Successful in 12s Details CI / Build (windows/amd64) (pull_request) Successful in 15s Details CI / Lint (pull_request) Successful in 19s Details CI / Build (linux/amd64) (pull_request) Successful in 12s Details CI / Build (linux/arm64) (pull_request) Successful in 44s Details CI / Test (server-http) (pull_request) Successful in 2m55s Details e2e / Playwright vs docker-compose (pull_request) Successful in 2m45s Details Keep the test WS client actively reading (a real agent always is) so the server-side conn stays registered under parallel load, and drain to completion via condition polling instead of asserting one-shot completeness. The conn could be dropped/unregistered under CI load, making DrainPending correctly no-op (conn==nil) and the test observe a partial/empty drain. -race confirms no production data race; the exactly-5-jobs assertion (proving the per-host mutex blocks double-dispatch) is unchanged. Verified: 0 failures over 25 loaded runs + 4 -race iterations.	2026-06-16 13:29:47 +01:00
steve	0f5110f3d9	Merge pull request 'Release v1.1.0 — CHANGELOG' (#32) from release-v1.1.0 into main Release / Build + push image (push) Successful in 3m39s Details	2026-06-16 07:32:00 +01:00
steve	0fbacf9f98	docs(changelog): v1.1.0 (always-on host mode) + retroactive v1.0.1 CI / Test (rest) (pull_request) Successful in 10s Details CI / Lint (pull_request) Successful in 16s Details CI / Build (windows/amd64) (pull_request) Successful in 11s Details CI / Build (linux/amd64) (pull_request) Successful in 12s Details CI / Build (linux/arm64) (pull_request) Successful in 11s Details CI / Test (store) (pull_request) Successful in 1m5s Details e2e / Playwright vs docker-compose (pull_request) Failing after 9s Details CI / Test (server-http) (pull_request) Failing after 2m43s Details	2026-06-15 23:07:43 +01:00
steve	d8fd4110b0	Merge pull request 'Always-On vs intermittent host mode (laptops): suppress offline noise, catch up missed backups' (#31) from feat-laptop-host-mode into main Reviewed-on: #31	2026-06-15 23:01:03 +01:00
steve	e17932d797	Merge branch 'main' into feat-laptop-host-mode CI / Test (rest) (pull_request) Successful in 1m6s Details CI / Lint (pull_request) Successful in 18s Details CI / Build (windows/amd64) (pull_request) Successful in 12s Details CI / Build (linux/amd64) (pull_request) Successful in 14s Details CI / Test (store) (pull_request) Successful in 1m8s Details CI / Build (linux/arm64) (pull_request) Successful in 11s Details e2e / Playwright vs docker-compose (pull_request) Failing after 10s Details CI / Test (server-http) (pull_request) Successful in 2m52s Details	2026-06-15 23:00:56 +01:00
steve	a30f824a3c	Merge pull request 'Tidy: fix stale-dated sparkline test + gitignore agent worktrees' (#30) from tidy-sparkline-test-and-gitignore into main Reviewed-on: #30	2026-06-15 22:32:53 +01:00
steve	239d55b65b	test(dashboard): use relative dates so sparkline test doesn't age out of the 30-day window CI / Test (store) (pull_request) Successful in 8s Details CI / Test (rest) (pull_request) Successful in 45s Details CI / Lint (pull_request) Successful in 33s Details CI / Build (windows/amd64) (pull_request) Successful in 44s Details CI / Build (linux/amd64) (pull_request) Successful in 47s Details CI / Build (linux/arm64) (pull_request) Successful in 45s Details CI / Test (server-http) (pull_request) Successful in 2m26s Details e2e / Playwright vs docker-compose (pull_request) Successful in 2m50s Details	2026-06-15 22:15:07 +01:00
steve	74e5b75380	chore: gitignore .claude/worktrees (transient agent worktrees)	2026-06-15 22:14:36 +01:00