P2 redesign · phase 2: store rewrite — sources, slim schedules, repo maintenance

Go-side data model rebuilt against migration 0008. The fat-Schedule shape (paths/excludes/tags/retention/manual/kind/options/hooks) is gone; that surface lives on source_groups now. * store/types.go - Schedule slimmed to {id, host_id, cron, enabled, source_group_ids, timestamps}. SourceGroupIDs populated by Get/List, accepted on Create/Update so callers pass desired junction state in one shape. - SourceGroup added: name (= snapshot tag), includes/excludes, retention_policy, retry_max + retry_backoff_seconds, cached conflict_dimension. - HostRepoMaintenance added: forget/prune/check cadences + enabled. - PendingRun added: offline-retry queue. - Host loses RepoInitialisedAt; gains BandwidthUpKBps + BandwidthDownKBps. - RetentionPolicy moves home from "schedule field" to "source group field" but the type itself + Summary() method unchanged. * store/sources.go (new) — CRUD + GetByName + ConflictDimension cache. Group writes bump host_schedule_version; conflict cache writes don't (server-internal projection, agent doesn't see it). * store/maintenance.go (new) — CreateDefault is idempotent (INSERT OR IGNORE). UpdateRepoMaintenance doesn't bump schedule version because these run on the server's own ticker, not the agent's local cron. * store/pending.go (new) — Enqueue / DuePendingRuns / Bump / Delete. * store/schedules.go — rewritten for slim shape + junction CRUD. Update wipes the schedule_source_groups junction wholesale and re-inserts (simpler than diffing). Adds SchedulesUsingGroup for retention-conflict detection + UI labels. * store/hosts.go — drops repo_initialised_at scan, adds bandwidth scan. New SetHostBandwidth helper. * HTTP layer — temporarily stubbed during this rewrite (501 returns with redesign_in_progress error code). 
Phase 3 fills these in against the new shape: - schedules.go REST CRUD - schedule_push.go agent reconciliation - ui_schedules.go HTML form CRUD Run-now-per-host + Init-repo handlers in ui_handlers.go also stubbed — both go away in the new model (Run-now per source group; auto-init at host enrolment). * enrollment.go — replaces "seed manual schedule from typed paths" with "seed default source group + repo-maintenance row." The default group gets the typed paths as its includes; operator edits later via Sources tab. * ws/handler.go — drops the MarkHostRepoInitialised projection (column is gone; auto-init makes it derivable from latest init job's status). Tests: * store: existing schedule test rewritten for slim shape + junction; new sources_test.go covers source-group CRUD, name uniqueness, conflict cache, repo-maintenance defaults + idempotent seed, pending-runs queue lifecycle. * http: schedules_test.go and schedule_push_test.go deleted — both exercised the obsolete fat-schedule API. Phase 3 rewrites them against the new endpoints. go test ./... green. cmd/server + cmd/agent build. The UI is broken end-to-end (schedules / sources / repo tabs all hit 501 stubs); Phase 3 restores REST + on-the-wire reconciliation; Phase 4 rewires the UI templates against the new model. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-02 21:30:41 +01:00
parent e717b6998c
commit e7eea7afac
16 changed files with 1076 additions and 1928 deletions
@@ -0,0 +1,103 @@
+package store
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"time"
+)
+
+// EnqueuePendingRun inserts p into pending_runs so the offline-retry
+// ticker can dispatch the missed cron tick later.
+//
+// p.ID, p.ScheduleID, p.SourceGroupID and p.HostID are required. Zero
+// values are defaulted in place on p before the insert: Attempt becomes
+// 1, and NextAttemptAt / ScheduledAt become the current UTC time. The
+// caller (the schedule firing path) is expected to set
+// next_attempt_at = now + group.retry_backoff_seconds × 2^(attempt-1)
+// when it wants a backoff.
+//
+// Returns an error if p is nil, if a required field is empty, or if the
+// insert fails (the driver error is wrapped).
+func (st *Store) EnqueuePendingRun(ctx context.Context, p *PendingRun) error {
+	// Guard against a nil row so a caller bug surfaces as an error
+	// instead of a nil-pointer panic on the field checks below.
+	if p == nil {
+		return errors.New("store: pending run required")
+	}
+	if p.ID == "" || p.ScheduleID == "" || p.SourceGroupID == "" || p.HostID == "" {
+		return errors.New("store: pending run id, schedule_id, source_group_id, host_id required")
+	}
+	if p.Attempt == 0 {
+		p.Attempt = 1
+	}
+	// Read the clock once so both defaulted timestamps agree.
+	now := time.Now().UTC()
+	if p.NextAttemptAt.IsZero() {
+		p.NextAttemptAt = now
+	}
+	if p.ScheduledAt.IsZero() {
+		p.ScheduledAt = now
+	}
+	_, err := st.db.ExecContext(ctx,
+		`INSERT INTO pending_runs (id, schedule_id, source_group_id, host_id,
+			attempt, next_attempt_at, scheduled_at, last_error)
+		 VALUES (?, ?, ?, ?, ?, ?, ?, ?)`,
+		p.ID, p.ScheduleID, p.SourceGroupID, p.HostID,
+		p.Attempt,
+		p.NextAttemptAt.UTC().Format(time.RFC3339Nano),
+		p.ScheduledAt.UTC().Format(time.RFC3339Nano),
+		nullableString(p.LastError))
+	if err != nil {
+		return fmt.Errorf("store: enqueue pending run: %w", err)
+	}
+	return nil
+}
+
+// DuePendingRuns returns up to limit rows whose next_attempt_at <= now,
+// ordered by next_attempt_at ascending (oldest first). Server-side
+// ticker calls this every ~30s.
+//
+// now is converted to UTC and formatted as RFC3339Nano before being
+// bound to the query. On success the result is a non-nil, possibly
+// empty slice; on any query, scan or iteration error the result is nil
+// and the error is wrapped.
+//
+// A stored timestamp that does not parse as RFC3339Nano leaves the
+// corresponding field at the zero time rather than failing the batch.
+//
+// NOTE(review): if next_attempt_at is a TEXT column the comparison is
+// lexicographic; RFC3339Nano omits trailing fractional zeros, so
+// ordering inside the same second may not be exact — confirm that is
+// acceptable for the ticker's granularity.
+func (st *Store) DuePendingRuns(ctx context.Context, now time.Time, limit int) ([]PendingRun, error) {
+	rows, err := st.db.QueryContext(ctx,
+		`SELECT id, schedule_id, source_group_id, host_id, attempt,
+			next_attempt_at, scheduled_at, COALESCE(last_error, '')
+		 FROM pending_runs
+		 WHERE next_attempt_at <= ?
+		 ORDER BY next_attempt_at
+		 LIMIT ?`,
+		now.UTC().Format(time.RFC3339Nano), limit)
+	if err != nil {
+		return nil, fmt.Errorf("store: due pending runs: %w", err)
+	}
+	defer rows.Close()
+	out := []PendingRun{}
+	for rows.Next() {
+		var p PendingRun
+		var nextAt, scheduledAt string
+		if err := rows.Scan(&p.ID, &p.ScheduleID, &p.SourceGroupID, &p.HostID,
+			&p.Attempt, &nextAt, &scheduledAt, &p.LastError); err != nil {
+			return nil, fmt.Errorf("store: scan pending run: %w", err)
+		}
+		// Best-effort parse: an unparseable value keeps the zero time.
+		if t, err := time.Parse(time.RFC3339Nano, nextAt); err == nil {
+			p.NextAttemptAt = t
+		}
+		if t, err := time.Parse(time.RFC3339Nano, scheduledAt); err == nil {
+			p.ScheduledAt = t
+		}
+		out = append(out, p)
+	}
+	// Do not hand back a partial batch alongside an iteration error.
+	if err := rows.Err(); err != nil {
+		return nil, fmt.Errorf("store: iterate pending runs: %w", err)
+	}
+	return out, nil
+}
+
+// DeletePendingRun removes a row by id. Called after successful
+// dispatch or after exceeding retry_max.
+func (st *Store) DeletePendingRun(ctx context.Context, id string) error {
+	_, err := st.db.ExecContext(ctx,
+		`DELETE FROM pending_runs WHERE id = ?`, id)
+	if err != nil {
+		return fmt.Errorf("store: delete pending run: %w", err)
+	}
+	return nil
+}
+
+// BumpPendingRunAttempt records another retry for the row with the
+// given id: attempt is incremented by one in SQL, next_attempt_at is
+// set to nextAttemptAt (converted to UTC, RFC3339Nano) and last_error
+// is set to lastError passed through nullableString. Meant to be called
+// after a failed retry once the caller has decided to try again. The
+// affected-row count is not inspected; a statement failure is wrapped
+// and returned.
+func (st *Store) BumpPendingRunAttempt(ctx context.Context, id string, nextAttemptAt time.Time, lastError string) error {
+	dueAt := nextAttemptAt.UTC().Format(time.RFC3339Nano)
+	if _, err := st.db.ExecContext(ctx,
+		`UPDATE pending_runs SET
+			attempt = attempt + 1,
+			next_attempt_at = ?,
+			last_error = ?
+		 WHERE id = ?`,
+		dueAt, nullableString(lastError), id,
+	); err != nil {
+		return fmt.Errorf("store: bump pending run: %w", err)
+	}
+	return nil
+}