server: enqueue pending_runs when scheduled-job dispatch fails
When conn.Send returns an error in dispatchBackupForGroup, enqueue a pending_runs row (attempt=1, next_attempt_at = now + group.RetryBackoffSeconds, falling back to 60s when that value is not positive) instead of silently dropping the fire. The orphaned queued job row is left behind for forensic visibility — the drainer will create a fresh job row on its retry. Also adds Store.ListPendingRunsForHost — the on-reconnect drain walks every row for the host, whether or not it is due yet, because once the host is back the scheduled retry time no longer matters.
This commit is contained in:
@@ -206,8 +206,30 @@ func (s *Server) dispatchBackupForGroup(ctx context.Context, conn *ws.Conn, host
|
||||
sendCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
|
||||
defer cancel()
|
||||
if err := conn.Send(sendCtx, env); err != nil {
|
||||
slog.Warn("schedule.fire: send command.run",
|
||||
"host_id", hostID, "schedule_id", scheduleID, "err", err)
|
||||
slog.Warn("schedule.fire: send command.run failed, queueing for retry",
|
||||
"host_id", hostID, "schedule_id", scheduleID, "group", g.Name, "err", err)
|
||||
backoff := time.Duration(g.RetryBackoffSeconds) * time.Second
|
||||
if backoff <= 0 {
|
||||
backoff = 60 * time.Second
|
||||
}
|
||||
if enqueueErr := s.deps.Store.EnqueuePendingRun(ctx, &store.PendingRun{
|
||||
ID: ulid.Make().String(),
|
||||
ScheduleID: scheduleID,
|
||||
SourceGroupID: g.ID,
|
||||
HostID: hostID,
|
||||
Attempt: 1,
|
||||
NextAttemptAt: time.Now().UTC().Add(backoff),
|
||||
ScheduledAt: scheduledAt,
|
||||
LastError: err.Error(),
|
||||
}); enqueueErr != nil {
|
||||
slog.Warn("schedule.fire: enqueue pending run failed",
|
||||
"host_id", hostID, "schedule_id", scheduleID, "group", g.Name, "err", enqueueErr)
|
||||
}
|
||||
// The job row was already persisted earlier in this function — leave
|
||||
// it in `queued` status. The drainer will re-dispatch (creating a
|
||||
// new job row) and the orphaned queued row stays for forensic
|
||||
// visibility. Don't delete it: the audit trail still wants to know
|
||||
// "we tried and the wire was wedged."
|
||||
return ""
|
||||
}
|
||||
_ = s.deps.Store.AppendAudit(ctx, store.AuditEntry{
|
||||
|
||||
Reference in New Issue
Block a user