From 49ecb7c771590782171d231d85bd19bcd2f5de33 Mon Sep 17 00:00:00 2001
From: Steve Cliff <steve@devcloud.guru>
Date: Sat, 2 May 2026 20:54:01 +0100
Subject: [PATCH] =?UTF-8?q?P2=20redesign=20=C2=B7=20phase=201:=20migration?=
 =?UTF-8?q?=200008=20=E2=80=94=20sources=20+=20repo=20maintenance?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Schema rebuild for the model collapse described in
design/v4-sources-redesign.html. Three nouns now stand on their
own:

* schedules — slim. Only cron + enabled + host_id. Fat-schedule
  shape (paths/excludes/tags/retention/manual/kind/options/hooks)
  is dropped wholesale. Schedule data wiped — by design (smoke env
  was nuked before this ran; fresh installs have nothing to lose).
* source_groups — name + includes + excludes + retention_policy +
  retry policy + cached conflict_dimension. Group name doubles as
  the snapshot tag so retention can target it cleanly. UNIQUE
  (host_id, name) enforces tag unambiguity.
* schedule_source_groups — N:M junction. One schedule can fire N
  groups per tick; one group can be referenced by N schedules.
* host_repo_maintenance — 1:1 with hosts. Default cadences:
  forget daily 03:00, prune weekly Sun 04:00, check monthly 1st
  05:00 with --read-data-subset 5%. Operator can edit on Repo tab.
* pending_runs — offline-retry queue. Server-side ticker dispatches
  due rows; bounded by source_groups.retry_max + retry_backoff_seconds.

Plus:
* hosts.bandwidth_up_kbps / .bandwidth_down_kbps — host-wide caps.
* hosts.repo_initialised_at — DROPPED. Auto-init on enrol makes
  it derivable from the latest init job; the Init-repo button goes
  too (failure surfaces via job history banner).

Note on FK safety: smoke env was wiped before migration ran, so
DROP TABLE schedules cascades to nothing. Fresh installs apply
0001-0007 then immediately 0008 — same story (no schedule rows
to lose). For an upgrade path on a populated DB, this migration
would need a data-preserving variant; not needed today.

Tests fail to compile/run after this — expected. The Go side
(store types, CRUD, REST handlers, agent runner, UI templates)
gets rebuilt in subsequent phases. tasks.md will track P2 redesign
progress.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../0008_sources_and_repo_maintenance.sql     | 150 ++++++++++++++++++
 1 file changed, 150 insertions(+)
 create mode 100644 internal/store/migrations/0008_sources_and_repo_maintenance.sql

diff --git a/internal/store/migrations/0008_sources_and_repo_maintenance.sql b/internal/store/migrations/0008_sources_and_repo_maintenance.sql
new file mode 100644
index 0000000..07f5508
--- /dev/null
+++ b/internal/store/migrations/0008_sources_and_repo_maintenance.sql
@@ -0,0 +1,150 @@
+-- 0008_sources_and_repo_maintenance.sql
+--
+-- Phase 2 redesign — collapse the data model around three nouns:
+-- source groups (what to back up), schedules (when), and repo
+-- maintenance (forget/prune/check on host-level cadences). Drops
+-- everything that made the previous model leaky:
+--   - schedule.paths / schedule.excludes / schedule.tags     (now on source_groups)
+--   - schedule.retention_policy                              (now on source_groups)
+--   - schedule.kind                                          (only "backup" survives as a schedule kind)
+--   - schedule.manual                                        (replaced by per-group Run-now)
+--   - schedule.pre_hook / post_hook                          (deferred; will land on source_groups when hooks ship)
+--   - schedule.options                                       (bandwidth moves to host)
+--   - host.repo_initialised_at                               (derivable from latest init job)
+--
+-- All schedule data is wiped — by design, this is a clean rebuild.
+-- The smoke env was nuked before this migration ran; fresh installs
+-- have no data to lose either.
+--
+-- Migration 0007 left schedules.manual + the legacy fat-schedule
+-- shape in place. We drop the entire schedules table here and
+-- recreate slim. jobs.scheduled_id has ON DELETE SET NULL so
+-- existing job rows that still reference dropped schedule rows
+-- get NULLed out as a side-effect (smoke env: zero rows; fresh:
+-- no rows yet).
+
+PRAGMA foreign_keys = OFF;  -- paired with the ON on the last line. NOTE(review): SQLite ignores this pragma inside an open transaction — confirm the migration runner does not wrap this file in BEGIN.
+
+-- ----- 1. Drop the fat schedules table -----------------------------------
+
+DROP TABLE schedules;  -- deliberate wipe: no rows are copied into the slim table created below
+
+-- ----- 2. Recreate slim schedules ---------------------------------------
+-- Schedules now own only "when" + "which groups". One schedule fires
+-- N restic-backup invocations per cron tick — one per group, each
+-- tagged with the group's name so retention can target it.
+
+CREATE TABLE schedules (  -- slim schedule: which host, which cron expression, on or off
+  id              TEXT PRIMARY KEY NOT NULL,  -- explicit NOT NULL: SQLite lets a bare TEXT PRIMARY KEY hold NULL
+  host_id         TEXT NOT NULL REFERENCES hosts(id) ON DELETE CASCADE,  -- schedule is removed with its host
+  cron_expr       TEXT NOT NULL,
+  enabled         INTEGER NOT NULL DEFAULT 1 CHECK (enabled IN (0, 1)),  -- boolean flag, on by default
+  created_at      TEXT NOT NULL,
+  updated_at      TEXT NOT NULL
+);
+CREATE INDEX schedules_host_id ON schedules(host_id);  -- per-host lookups and the hosts -> schedules cascade
+
+-- ----- 3. Source groups -------------------------------------------------
+-- A source group bundles include + exclude paths plus the retention
+-- policy that applies to the snapshots it produces. Each group's
+-- name doubles as the snapshot tag; retention runs as
+--   restic forget --tag <name> --keep-* …
+-- per group on the host's nightly forget cadence.
+
+CREATE TABLE source_groups (  -- what to back up, plus the retention/retry policy for the snapshots it yields
+  id                       TEXT PRIMARY KEY NOT NULL,  -- explicit NOT NULL: SQLite lets a bare TEXT PRIMARY KEY hold NULL
+  host_id                  TEXT NOT NULL REFERENCES hosts(id) ON DELETE CASCADE,
+  name                     TEXT NOT NULL,               -- doubles as the snapshot tag
+  includes                 TEXT NOT NULL DEFAULT '[]',  -- json array
+  excludes                 TEXT NOT NULL DEFAULT '[]',  -- json array
+  retention_policy         TEXT NOT NULL DEFAULT '{}',  -- json object: keep_last, keep_hourly, …
+  retry_max                INTEGER NOT NULL DEFAULT 3 CHECK (retry_max >= 0),               -- negative retry counts are rejected
+  retry_backoff_seconds    INTEGER NOT NULL DEFAULT 60 CHECK (retry_backoff_seconds >= 0),  -- negative delays are rejected
+  -- conflict_dimension is the cached name of the failing keep-* on
+  -- a granularity↔cadence mismatch (e.g. "hourly" when keep-hourly
+  -- is set but no schedule pointing at this group fires sub-daily).
+  -- NULL means no conflict. Refreshed on every schedule + group CRUD.
+  conflict_dimension       TEXT,
+  created_at               TEXT NOT NULL,
+  updated_at               TEXT NOT NULL,
+  -- Group names are unique per host so the snapshot-tag → group
+  -- mapping is unambiguous. Names can repeat across hosts.
+  UNIQUE (host_id, name)
+);
+CREATE INDEX source_groups_host_id ON source_groups(host_id);
+
+-- ----- 4. Schedule ↔ source group junction -------------------------------
+-- N:M. A schedule can point at multiple groups (one tick → N backups);
+-- a group can be referenced by multiple schedules (rapid hourly +
+-- daily checkpoint). ON DELETE CASCADE on either side prunes the
+-- junction row when its parent goes.
+
+CREATE TABLE schedule_source_groups (
+  schedule_id      TEXT NOT NULL REFERENCES schedules(id) ON DELETE CASCADE,  -- link row goes when its schedule goes
+  source_group_id  TEXT NOT NULL REFERENCES source_groups(id) ON DELETE CASCADE,  -- link row goes when its group goes
+  PRIMARY KEY (schedule_id, source_group_id)  -- at most one link per (schedule, group) pair
+);  -- NOTE(review): nothing here forces the linked schedule and group to share a host_id — confirm the store layer checks it
+CREATE INDEX schedule_source_groups_group_id  -- reverse direction (group -> schedules); the PK leads with schedule_id
+  ON schedule_source_groups(source_group_id);
+
+-- ----- 5. Host repo maintenance -----------------------------------------
+-- forget / prune / check are repo-level operations (1:1 with host's
+-- repo, not per source-group). One row per host with sensible
+-- defaults; the agent runs each on its cron cadence.
+--
+-- forget runs per source-group internally: agent walks every
+-- enabled group on the host and runs `restic forget --tag <group>
+-- --keep-* …` with the group's own retention policy.
+--
+-- prune is heavy — weekly. check is monthly with --read-data-subset
+-- so a year's worth of monthly checks covers everything.
+
+CREATE TABLE host_repo_maintenance (  -- one row of forget/prune/check cadences per host
+  host_id            TEXT PRIMARY KEY NOT NULL REFERENCES hosts(id) ON DELETE CASCADE,  -- NOT NULL: a NULL key would dodge both the PK and the FK check
+  forget_cron        TEXT NOT NULL DEFAULT '0 3 * * *',  -- daily 03:00
+  forget_enabled     INTEGER NOT NULL DEFAULT 1 CHECK (forget_enabled IN (0, 1)),
+  prune_cron         TEXT NOT NULL DEFAULT '0 4 * * 0',  -- Sundays 04:00
+  prune_enabled      INTEGER NOT NULL DEFAULT 1 CHECK (prune_enabled IN (0, 1)),
+  check_cron         TEXT NOT NULL DEFAULT '0 5 1 * *',  -- 1st of the month 05:00
+  check_enabled      INTEGER NOT NULL DEFAULT 1 CHECK (check_enabled IN (0, 1)),
+  check_subset_pct   INTEGER NOT NULL DEFAULT 5 CHECK (check_subset_pct BETWEEN 0 AND 100)  -- percentage for --read-data-subset
+);
+
+-- ----- 6. Pending runs (offline retry queue) ----------------------------
+-- When the agent is offline at fire time, the server schedules a
+-- retry instead of dropping the tick. Per source group: the group's
+-- retry_max + retry_backoff_seconds bound the loop. Server-side
+-- ticker polls due rows every ~30s and dispatches if Hub.Connected.
+-- Cleared on successful dispatch or attempts >= retry_max.
+
+CREATE TABLE pending_runs (  -- offline-retry queue: one row per (schedule, group) tick awaiting dispatch
+  id                  TEXT PRIMARY KEY NOT NULL,      -- explicit NOT NULL: SQLite lets a bare TEXT PRIMARY KEY hold NULL
+  schedule_id         TEXT NOT NULL REFERENCES schedules(id) ON DELETE CASCADE,
+  source_group_id     TEXT NOT NULL REFERENCES source_groups(id) ON DELETE CASCADE,
+  host_id             TEXT NOT NULL REFERENCES hosts(id) ON DELETE CASCADE,
+  attempt             INTEGER NOT NULL DEFAULT 1 CHECK (attempt >= 0),  -- attempt counter; never negative
+  next_attempt_at     TEXT NOT NULL,                  -- when the row becomes due
+  scheduled_at        TEXT NOT NULL,                  -- original tick time, for forensic logging
+  last_error          TEXT                            -- nullable: unset until an attempt records an error
+);
+CREATE INDEX pending_runs_due ON pending_runs(next_attempt_at);  -- serves the ticker's due-row poll
+CREATE INDEX pending_runs_host_id ON pending_runs(host_id);
+
+-- ----- 7. Bandwidth caps on hosts (host-wide, not per-group) -------------
+
+ALTER TABLE hosts ADD COLUMN bandwidth_up_kbps INTEGER;    -- nullable, no default; presumably NULL = no upload cap. NOTE(review): restic's --limit-upload takes KiB/s — confirm the unit conversion
+ALTER TABLE hosts ADD COLUMN bandwidth_down_kbps INTEGER;  -- nullable, no default; presumably NULL = no download cap. NOTE(review): restic's --limit-download takes KiB/s — confirm the unit conversion
+
+-- ----- 8. Drop host.repo_initialised_at ---------------------------------
+-- Auto-init on host enrol makes this derivable from the latest init
+-- job's status. The Init-repo button (red affordance) goes too;
+-- failure is surfaced by a job-history banner, not a button.
+
+ALTER TABLE hosts DROP COLUMN repo_initialised_at;  -- NOTE(review): DROP COLUMN needs SQLite 3.35+ and is refused if the column is indexed or used by a view/trigger — confirm against the bundled SQLite
+
+-- ----- 9. host_schedule_version stays --------------------------------
+-- Still load-bearing: bumped on any source_group / schedule / junction
+-- CRUD. Pushed to the agent in the schedule.set payload alongside
+-- inline groups, ack'd in schedule.ack. Kept as-is.
+
+PRAGMA foreign_keys = ON;  -- undoes the OFF at the top of this migration. NOTE(review): no PRAGMA foreign_key_check follows — consider one if this ever runs against populated data