restic-manager/internal/server/http/pending_drain.go

// pending_drain.go — drains pending_runs rows that are due (or, on
// agent reconnect, every row for that host).
//
// Two trigger paths:
//  1. The 30s tick in cmd/server (DrainAllDue) — sweeps every host
//     with rows whose next_attempt_at <= now.
//  2. onAgentHello (DrainPending(hostID)) — when a host comes back,
//     walk all of its pending rows synchronously so the operator
//     sees the queue drain promptly.
package http

import (
	"context"
	"errors"
	"log/slog"
	"time"

	"github.com/oklog/ulid/v2"

	"gitea.dcglab.co.uk/steve/restic-manager/internal/server/ws"
	"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
)

// Tunables for the pending_runs drain loop.
const (
	// pendingDrainBackoffMax is the ceiling drainOne applies to the
	// exponentially growing delay between re-dispatch attempts.
	pendingDrainBackoffMax = 30 * time.Minute

	// pendingDrainBatchLimit is the row limit DrainAllDue passes to
	// Store.DuePendingRuns on each sweep.
	pendingDrainBatchLimit = 100
)

// DrainPending re-dispatches the pending_runs rows that
// Store.ListPendingRunsForHost returns for hostID, handing each one to
// drainOne in turn.
//
// The host is expected to be connected already (the caller's
// responsibility — typically onAgentHello). If the hub has no
// connection for it by the time we look, nothing is touched and the
// rows are left for the next tick or reconnect.
//
// Per row, drainOne either abandons it (schedule/source group gone,
// schedule disabled, or retry_max reached — all audit-logged), deletes
// it after a successful dispatch, or bumps its attempt counter and
// reschedules it with exponential backoff.
//
// The function returns nothing; every failure is logged via slog. It
// is a no-op when no hub is configured, and it stops early once ctx is
// done so a shutdown does not produce one failed store call per
// remaining row.
func (s *Server) DrainPending(ctx context.Context, hostID string) {
	// DrainAllDue tolerates a nil hub; this exported entry point must
	// as well, otherwise Hub.Conn below would be called on nil.
	if s.deps.Hub == nil {
		return
	}
	runs, err := s.deps.Store.ListPendingRunsForHost(ctx, hostID)
	if err != nil {
		slog.Warn("drain pending: list", "host_id", hostID, "err", err)
		return
	}
	if len(runs) == 0 {
		return
	}
	conn := s.deps.Hub.Conn(hostID)
	if conn == nil {
		// Host went offline between the connectedness check and now.
		// Skip — next tick or next reconnect will retry.
		return
	}
	for i, p := range runs {
		// Once ctx is cancelled every store call in drainOne would
		// fail anyway; bail out instead of logging a warning per row.
		if err := ctx.Err(); err != nil {
			slog.Info("drain pending: stopping early",
				"host_id", hostID, "remaining", len(runs)-i, "err", err)
			return
		}
		s.drainOne(ctx, conn, p)
	}
}

// drainOne handles a single pending row for a host reachable via conn.
// Side-effects (delete, bump, audit, dispatch) are all per-row.
//
// Outcomes, in the order they are checked:
//   - schedule lookup returns store.ErrNotFound, or the schedule is
//     disabled: the row is abandoned (audited + deleted).
//   - source group lookup returns store.ErrNotFound: abandoned.
//   - any other lookup error: logged, row left untouched so a later
//     drain can retry it. A transient store failure must not destroy a
//     queued run.
//   - g.RetryMax > 0 and p.Attempt >= g.RetryMax: abandoned. A
//     RetryMax of zero or less disables this limit.
//   - dispatch yields an empty job ID: the attempt is bumped and the
//     row rescheduled after pendingDrainBackoff.
//   - otherwise: the row is deleted and the dispatch is logged.
func (s *Server) drainOne(ctx context.Context, conn *ws.Conn, p store.PendingRun) {
	sc, err := s.deps.Store.GetSchedule(ctx, p.HostID, p.ScheduleID)
	if err != nil {
		if errors.Is(err, store.ErrNotFound) {
			s.abandonPending(ctx, p, "schedule gone")
			return
		}
		slog.Warn("drain pending: load schedule",
			"host_id", p.HostID, "schedule_id", p.ScheduleID, "err", err)
		return
	}
	if !sc.Enabled {
		s.abandonPending(ctx, p, "schedule disabled")
		return
	}
	g, err := s.deps.Store.GetSourceGroup(ctx, p.HostID, p.SourceGroupID)
	if err != nil {
		// Mirror the schedule lookup: only a definite "not found" is
		// grounds for dropping the row.
		// NOTE(review): assumes GetSourceGroup reports a missing group
		// as store.ErrNotFound, as GetSchedule does — confirm.
		if errors.Is(err, store.ErrNotFound) {
			s.abandonPending(ctx, p, "source group gone")
			return
		}
		slog.Warn("drain pending: load source group",
			"host_id", p.HostID, "source_group_id", p.SourceGroupID, "err", err)
		return
	}
	if g.RetryMax > 0 && p.Attempt >= g.RetryMax {
		s.abandonPending(ctx, p, "retry_max exceeded")
		return
	}
	// Calls dispatchBackupForGroupCore (not dispatchBackupForGroup) so a
	// failed Send doesn't double-enqueue: dispatchBackupForGroup's
	// enqueue-on-failure path would create a NEW pending_runs row while
	// this function already bumps the EXISTING row via
	// BumpPendingRunAttempt, producing geometric duplicates on repeated
	// failures.
	//
	// The second return value is deliberately discarded: an empty
	// jobID is the failure signal acted on here.
	jobID, _ := s.dispatchBackupForGroupCore(ctx, conn, p.HostID, p.ScheduleID, g, p.ScheduledAt)
	if jobID == "" {
		// Send failed again. Bump attempt with exponential backoff.
		base := time.Duration(g.RetryBackoffSeconds) * time.Second
		next := time.Now().UTC().Add(pendingDrainBackoff(base, p.Attempt))
		if err := s.deps.Store.BumpPendingRunAttempt(ctx, p.ID, next, "drain dispatch failed"); err != nil {
			slog.Warn("drain pending: bump", "host_id", p.HostID, "id", p.ID, "err", err)
		}
		return
	}
	// Success — drop the pending row. A failed delete is only logged;
	// the dispatch itself has already happened.
	if err := s.deps.Store.DeletePendingRun(ctx, p.ID); err != nil {
		slog.Warn("drain pending: delete after dispatch", "host_id", p.HostID, "id", p.ID, "err", err)
	}
	slog.Info("drain pending: dispatched",
		"host_id", p.HostID, "schedule_id", p.ScheduleID, "group", g.Name,
		"attempt", p.Attempt, "job_id", jobID)
}

// pendingDrainBackoff returns the delay before the next re-dispatch:
// base doubled once per prior attempt, never more than
// pendingDrainBackoffMax. A non-positive base falls back to 60s.
//
// The value is clamped before doubling as well as after, so an
// oversized base is capped even when attempt is 0 and the doubling
// can never overflow time.Duration.
func pendingDrainBackoff(base time.Duration, attempt int) time.Duration {
	if base <= 0 {
		base = 60 * time.Second
	}
	backoff := base
	if backoff > pendingDrainBackoffMax {
		backoff = pendingDrainBackoffMax
	}
	for i := 0; i < attempt && backoff < pendingDrainBackoffMax; i++ {
		backoff *= 2
	}
	if backoff > pendingDrainBackoffMax {
		backoff = pendingDrainBackoffMax
	}
	return backoff
}

// abandonPending gives up on a pending row: it logs the decision,
// appends a "pending_run.abandoned" audit entry targeting the row's
// schedule, and then deletes the row.
//
// reason is written to the log line only; the audit entry built here
// carries the schedule ID and timestamp but no reason text. Failures
// of the audit append or of the delete are logged and otherwise
// ignored — in particular the delete is still attempted when the audit
// append fails.
func (s *Server) abandonPending(ctx context.Context, p store.PendingRun, reason string) {
	slog.Info("drain pending: abandoning",
		"host_id", p.HostID, "schedule_id", p.ScheduleID,
		"attempt", p.Attempt, "reason", reason)

	// Copy the ID so the entry points at a local, not at a field of p.
	targetID := p.ScheduleID
	entry := store.AuditEntry{
		ID:         ulid.Make().String(),
		Actor:      "system",
		Action:     "pending_run.abandoned",
		TargetKind: ptr("schedule"),
		TargetID:   &targetID,
		TS:         time.Now().UTC(),
	}

	auditErr := s.deps.Store.AppendAudit(ctx, entry)
	if auditErr != nil {
		slog.Warn("drain pending: audit on abandon", "id", p.ID, "err", auditErr)
	}

	deleteErr := s.deps.Store.DeletePendingRun(ctx, p.ID)
	if deleteErr != nil {
		slog.Warn("drain pending: delete on abandon", "id", p.ID, "err", deleteErr)
	}
}

// DrainAllDue is the periodic (30s ticker) entrypoint. It asks the
// store for up to pendingDrainBatchLimit rows that are due as of now,
// and calls DrainPending once for each distinct host among them that
// the hub reports as connected. Hosts that are not connected are
// skipped. It is a no-op when no hub is configured.
//
// DrainPending re-lists the host's rows itself rather than reusing the
// rows fetched here, which keeps the per-host concurrency model simple.
//
// NOTE(review): DrainPending lists rows by host with no due-time
// argument, so a host with one due row appears to get all of its rows
// retried, including ones still backing off — confirm this is intended.
// NOTE(review): the batch limit applies before the connected filter,
// so a batch filled by offline hosts may hide due rows of connected
// hosts — confirm against DuePendingRuns' ordering.
func (s *Server) DrainAllDue(ctx context.Context) {
	hub := s.deps.Hub
	if hub == nil {
		return
	}

	now := time.Now().UTC()
	rows, err := s.deps.Store.DuePendingRuns(ctx, now, pendingDrainBatchLimit)
	if err != nil {
		slog.Warn("drain all due: list", "err", err)
		return
	}

	// An empty result simply falls through the loop below.
	visited := make(map[string]struct{}, len(rows))
	for _, row := range rows {
		hostID := row.HostID
		if _, dup := visited[hostID]; dup {
			continue
		}
		visited[hostID] = struct{}{}
		if hub.Connected(hostID) {
			s.DrainPending(ctx, hostID)
		}
	}
}