Merge pull request 'De-flake TestDrainPendingSerializesPerHost (CI stability)' (#33) from fix-flaky-server-http-tests into main
Reviewed-on: #33
This commit was merged in pull request #33.
This commit is contained in:
@@ -512,11 +512,27 @@ func TestDrainPendingSerializesPerHost(t *testing.T) {
|
||||
// Connect the agent so DrainPending can dispatch.
|
||||
c := agentDial(t, srv, ts, hostID, token)
|
||||
sendHello(t, c, "serialise-host")
|
||||
// Drain the on-hello goroutine's pass first (no pending rows yet),
|
||||
// then wait for the schedule.set so the connection is fully settled.
|
||||
// Wait for the on-hello push to settle.
|
||||
_ = drainUntil(t, c, api.MsgScheduleSet)
|
||||
|
||||
// Insert 5 pending rows now that the on-hello drain has already run.
|
||||
// A real agent is always in a read loop. Keep this test client
|
||||
// reading in the background for the rest of the test: without an
|
||||
// active reader the server-side conn can be dropped under parallel
|
||||
// load, which unregisters it from the hub and makes DrainPending
|
||||
// no-op (conn == nil) — the historical source of this test's
|
||||
// flakiness (it would observe 0 or a partial drain). The reader also
|
||||
// consumes the command.run envelopes our drains emit.
|
||||
readerCtx, stopReader := context.WithCancel(context.Background())
|
||||
defer stopReader()
|
||||
go func() {
|
||||
for {
|
||||
if _, _, err := c.Read(readerCtx); err != nil {
|
||||
return
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
// Insert 5 due pending rows.
|
||||
now := time.Now().UTC()
|
||||
for i := range 5 {
|
||||
pid := ulid.Make().String()
|
||||
@@ -533,7 +549,8 @@ func TestDrainPendingSerializesPerHost(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// Spawn 10 goroutines all calling DrainPending concurrently.
|
||||
// Fire 10 concurrent DrainPending calls. The per-host mutex must
|
||||
// ensure each row is dispatched at most once (no double-dispatch).
|
||||
var wg sync.WaitGroup
|
||||
for range 10 {
|
||||
wg.Add(1)
|
||||
@@ -544,24 +561,26 @@ func TestDrainPendingSerializesPerHost(t *testing.T) {
|
||||
}
|
||||
wg.Wait()
|
||||
|
||||
// Drain any envelopes the agent received so we don't block below.
|
||||
// We read with short timeouts and stop when the connection goes quiet.
|
||||
drainDeadline := time.Now().Add(500 * time.Millisecond)
|
||||
for time.Now().Before(drainDeadline) {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond)
|
||||
_, _, err := c.Read(ctx)
|
||||
cancel()
|
||||
if err != nil {
|
||||
break
|
||||
}
|
||||
// Drain to completion. The fire-and-forget on-hello DrainPending
|
||||
// shares the same per-host mutex and can hold it during the burst,
|
||||
// leaving rows for a later pass — exactly how production drains
|
||||
// (repeatedly, via the 30s tick / on reconnect). Re-drain until the
|
||||
// queue is empty; because every drain is still serialised, each row
|
||||
// is dispatched at most once, so the exactly-5 job count below proves
|
||||
// there was no double-dispatch.
|
||||
deadline := time.Now().Add(5 * time.Second)
|
||||
for countPendingForHost(t, st, hostID) > 0 && time.Now().Before(deadline) {
|
||||
srv.DrainPending(context.Background(), hostID)
|
||||
time.Sleep(10 * time.Millisecond)
|
||||
}
|
||||
|
||||
// All 5 pending rows must be gone.
|
||||
// All 5 pending rows must be drained.
|
||||
if n := countPendingForHost(t, st, hostID); n != 0 {
|
||||
t.Errorf("pending rows after concurrent drain: got %d, want 0", n)
|
||||
t.Errorf("pending rows after drain-to-completion: got %d, want 0", n)
|
||||
}
|
||||
|
||||
// Exactly 5 backup job rows (one per pending row), not 10+ from a race.
|
||||
// Exactly 5 backup job rows (one per pending row) — never more, which
|
||||
// would mean the per-host mutex failed to prevent double-dispatch.
|
||||
var n int
|
||||
_ = st.DB().QueryRow(
|
||||
`SELECT COUNT(*) FROM jobs WHERE host_id = ? AND kind = 'backup' AND actor_kind = 'schedule'`,
|
||||
|
||||
Reference in New Issue
Block a user