server: drainer abandons only on ErrNotFound, not transient errors

drainOne now checks errors.Is(err, store.ErrNotFound) when GetSourceGroup fails and calls abandonPending only in that case, mirroring the existing GetSchedule pattern. Transient errors (SQLITE_BUSY, context cancellation) now log a warning and return without deleting the row. Also adds the regression test TestDrainPendingDropsRowsForGoneSourceGroup, which confirms that the ErrNotFound path still abandons the row, and a comment above the backoff-doubling loop explaining the progression.
2026-05-04 00:07:33 +01:00
parent d6dcdd5ec4
commit e0eae0a96f
2 changed files with 66 additions and 1 deletions
@@ -74,7 +74,12 @@ func (s *Server) drainOne(ctx context.Context, conn *ws.Conn, p store.PendingRun
 	}
 	g, err := s.deps.Store.GetSourceGroup(ctx, p.HostID, p.SourceGroupID)
 	if err != nil {
-		s.abandonPending(ctx, p, "source group gone")
+		if errors.Is(err, store.ErrNotFound) {
+			s.abandonPending(ctx, p, "source group gone")
+		} else {
+			slog.Warn("drain pending: load source group",
+				"host_id", p.HostID, "group_id", p.SourceGroupID, "err", err)
+		}
 		return
 	}
 	if g.RetryMax > 0 && p.Attempt >= g.RetryMax {
@@ -90,6 +95,10 @@ func (s *Server) drainOne(ctx context.Context, conn *ws.Conn, p store.PendingRun
 	jobID, _ := s.dispatchBackupForGroupCore(ctx, conn, p.HostID, p.ScheduleID, g, p.ScheduledAt)
 	if jobID == "" {
 		// Send failed again. Bump attempt with exponential backoff.
+		// Exponential backoff doubles immediately on the first drain
+		// retry: enqueue at base, attempt=1 → drain → 2*base, attempt=2 →
+		// drain → 4*base, etc. Capped at pendingDrainBackoffMax. With
+		// defaults (60s base, retry_max=3) the schedule is 60→120→240s.
 		baseBackoff := time.Duration(g.RetryBackoffSeconds) * time.Second
 		if baseBackoff <= 0 {
 			baseBackoff = 60 * time.Second