From 440ac5cc185b32a92d2070cf89c6b6a1047d3905 Mon Sep 17 00:00:00 2001 From: Steve Cliff Date: Mon, 4 May 2026 00:29:52 +0100 Subject: [PATCH] store: LatestJobByKind includes in-flight jobs (avoid maintenance double-fire) Widen the SQL query to consider all statuses (queued, running, succeeded, failed, cancelled) rather than terminal-only. An in-flight prune that outlasts the 60s tick interval previously produced ErrNotFound, causing the ticker to anchor at now-24h and fire a second prune concurrently with the first. Update the doc comment and test: remove the "queued job filtered out" case, add assertions that a running job and a queued job are each returned as the latest. --- internal/store/jobs.go | 10 ++++------ internal/store/jobs_test.go | 33 +++++++++++++++++++++++++++------ 2 files changed, 31 insertions(+), 12 deletions(-) diff --git a/internal/store/jobs.go b/internal/store/jobs.go index 1cec113..633fcc6 100644 --- a/internal/store/jobs.go +++ b/internal/store/jobs.go @@ -193,20 +193,18 @@ func (s *Store) GetJob(ctx context.Context, id string) (*Job, error) { return &j, nil } -// LatestJobByKind returns the most recent terminal job (status in -// 'succeeded','failed','cancelled' — UK spelling matches the wire/DB -// literal, see api.JobCancelled) of the given kind for the host, or +// LatestJobByKind returns the most recent job (any status, including +// queued and running) of the given kind for the host, or // (nil, ErrNotFound) if no such job exists. Used by the maintenance // ticker to compute "last fire" anchors for the cron-due check; -// queued and running jobs are excluded so an in-flight run doesn't -// suppress its own cron tick from firing. //nolint:misspell // wire format +// in-flight jobs MUST be considered or a long-running prune (>60s) +// would re-fire on the next tick while the first is still running. func (s *Store) LatestJobByKind(ctx context.Context, hostID, kind string) (*Job, error) { row := s.db.QueryRowContext(ctx, `SELECT id, host_id, kind, status, scheduled_id, actor_kind, actor_id, started_at, finished_at, exit_code, stats, error, created_at FROM jobs WHERE host_id = ? AND kind = ? - AND status IN ('succeeded','failed','cancelled') ORDER BY created_at DESC LIMIT 1`, hostID, kind) var ( diff --git a/internal/store/jobs_test.go b/internal/store/jobs_test.go index 46af279..9339a66 100644 --- a/internal/store/jobs_test.go +++ b/internal/store/jobs_test.go @@ -49,20 +49,41 @@ func TestLatestJobByKind(t *testing.T) { t.Errorf("want j-new, got %q", got.ID) } - // A queued job should be ignored — terminal-status filter. - queuedAt := time.Now().UTC() + // An in-flight running job must be returned — long-prune-suppresses-tick + // scenario: if a prune runs >60s the next tick must not re-fire it. + runningAt := time.Now().UTC() + if err := s.CreateJob(ctx, Job{ + ID: "j-running", HostID: hostID, Kind: "forget", + ActorKind: "system", CreatedAt: runningAt, + }); err != nil { + t.Fatalf("create running: %v", err) + } + if err := s.MarkJobStarted(ctx, "j-running", runningAt); err != nil { + t.Fatalf("mark started: %v", err) + } + got2, err := s.LatestJobByKind(ctx, hostID, "forget") + if err != nil { + t.Fatalf("LatestJobByKind 2: %v", err) + } + if got2.ID != "j-running" { + t.Errorf("in-flight running job must be returned; want j-running, got %q", got2.ID) + } + + // A queued (not-yet-started) job is also returned (it is newer than + // j-running because CreatedAt is later). + queuedAt := runningAt.Add(time.Millisecond) if err := s.CreateJob(ctx, Job{ ID: "j-queued", HostID: hostID, Kind: "forget", ActorKind: "system", CreatedAt: queuedAt, }); err != nil { t.Fatalf("create queued: %v", err) } - got2, err := s.LatestJobByKind(ctx, hostID, "forget") + got3, err := s.LatestJobByKind(ctx, hostID, "forget") if err != nil { - t.Fatalf("LatestJobByKind 2: %v", err) + t.Fatalf("LatestJobByKind 3: %v", err) } - if got2.ID != "j-new" { - t.Errorf("queued job should be ignored; want j-new, got %q", got2.ID) + if got3.ID != "j-queued" { + t.Errorf("queued job must be returned as newest; want j-queued, got %q", got3.ID) } // Different kind → ErrNotFound.