From 49ecb7c771590782171d231d85bd19bcd2f5de33 Mon Sep 17 00:00:00 2001 From: Steve Cliff Date: Sat, 2 May 2026 20:54:01 +0100 Subject: [PATCH] =?UTF-8?q?P2=20redesign=20=C2=B7=20phase=201:=20migration?= =?UTF-8?q?=200008=20=E2=80=94=20sources=20+=20repo=20maintenance?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Schema rebuild for the model collapse described in design/v4-sources-redesign.html. Three nouns now stand on their own (source groups, schedules, repo maintenance), backed by five tables: * schedules — slim. Only cron + enabled + host_id. Fat-schedule shape (paths/excludes/tags/retention/manual/kind/options/hooks) is dropped wholesale. Schedule data wiped — by design (smoke env was nuked before this ran; fresh installs have nothing to lose). * source_groups — name + includes + excludes + retention_policy + retry policy + cached conflict_dimension. Group name doubles as the snapshot tag so retention can target it cleanly. UNIQUE (host_id, name) enforces tag unambiguity. * schedule_source_groups — N:M junction. One schedule can fire N groups per tick; one group can be referenced by N schedules. * host_repo_maintenance — 1:1 with hosts. Default cadences: forget daily 03:00, prune weekly Sun 04:00, check monthly 1st 05:00 with --read-data-subset 5%. Operator can edit on Repo tab. * pending_runs — offline-retry queue. Server-side ticker dispatches due rows; bounded by source_groups.retry_max + retry_backoff_seconds. Plus: * hosts.bandwidth_up_kbps / .bandwidth_down_kbps — host-wide caps. * hosts.repo_initialised_at — DROPPED. Auto-init on enrol makes it derivable from the latest init job; the Init-repo button goes too (failure surfaces via job history banner). Note on FK safety: smoke env was wiped before migration ran, so DROP TABLE schedules cascades to nothing. Fresh installs apply 0001-0007 then immediately 0008 — same story (no schedule rows to lose). For an upgrade path on a populated DB, this migration would need a data-preserving variant; not needed today. 
Tests fail to compile/run after this — expected. The Go side (store
types, CRUD, REST handlers, agent runner, UI templates) gets rebuilt
in subsequent phases. tasks.md will track P2 redesign progress.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 .../0008_sources_and_repo_maintenance.sql | 150 ++++++++++++++++++
 1 file changed, 150 insertions(+)
 create mode 100644 internal/store/migrations/0008_sources_and_repo_maintenance.sql

diff --git a/internal/store/migrations/0008_sources_and_repo_maintenance.sql b/internal/store/migrations/0008_sources_and_repo_maintenance.sql
new file mode 100644
index 0000000..07f5508
--- /dev/null
+++ b/internal/store/migrations/0008_sources_and_repo_maintenance.sql
@@ -0,0 +1,150 @@
+-- 0008_sources_and_repo_maintenance.sql
+--
+-- Phase 2 redesign — collapse the data model around three nouns:
+-- source groups (what to back up), schedules (when), and repo
+-- maintenance (forget/prune/check on host-level cadences). Drops
+-- everything that made the previous model leaky:
+--   - schedule.paths / schedule.excludes / schedule.tags (now on source_groups)
+--   - schedule.retention_policy (now on source_groups)
+--   - schedule.kind (only "backup" survives as a schedule kind)
+--   - schedule.manual (replaced by per-group Run-now)
+--   - schedule.pre_hook / post_hook (deferred; will land on source_groups when hooks ship)
+--   - schedule.options (bandwidth moves to host)
+--   - host.repo_initialised_at (derivable from latest init job)
+--
+-- All schedule data is wiped — by design, this is a clean rebuild.
+-- The smoke env was nuked before this migration ran; fresh installs
+-- have no data to lose either.
+--
+-- Migration 0007 left schedules.manual + the legacy fat-schedule
+-- shape in place. We drop the entire schedules table here and
+-- recreate slim. jobs.scheduled_id is declared ON DELETE SET NULL;
+-- NOTE(review): with foreign_keys = OFF (below) DROP TABLE skips the
+-- implicit DELETE, so stale ids would not be NULLed — confirm.
+-- Harmless today (smoke env: zero rows; fresh: no rows yet).
+
+PRAGMA foreign_keys = OFF;  -- NOTE(review): no-op inside a transaction — confirm the migration runner
+
+-- ----- 1. Drop the fat schedules table -----------------------------------
+
+DROP TABLE schedules;
+
+-- ----- 2. Recreate slim schedules ---------------------------------------
+-- Schedules now own only "when" + "which groups". One schedule fires
+-- N restic-backup invocations per cron tick — one per group, each
+-- tagged with the group's name so retention can target it.
+
+CREATE TABLE schedules (
+    id          TEXT PRIMARY KEY,
+    host_id     TEXT NOT NULL REFERENCES hosts(id) ON DELETE CASCADE,
+    cron_expr   TEXT NOT NULL,
+    enabled     INTEGER NOT NULL DEFAULT 1,  -- 0/1 flag; new schedules start enabled
+    created_at  TEXT NOT NULL,
+    updated_at  TEXT NOT NULL
+);
+CREATE INDEX schedules_host_id ON schedules(host_id);
+
+-- ----- 3. Source groups -------------------------------------------------
+-- A source group bundles include + exclude paths plus the retention
+-- policy that applies to the snapshots it produces. Each group's
+-- name doubles as the snapshot tag; retention runs as
+--   restic forget --tag <group-name> --keep-* …
+-- per group on the host's nightly forget cadence.
+
+CREATE TABLE source_groups (
+    id                    TEXT PRIMARY KEY,
+    host_id               TEXT NOT NULL REFERENCES hosts(id) ON DELETE CASCADE,
+    name                  TEXT NOT NULL,
+    includes              TEXT NOT NULL DEFAULT '[]', -- json array
+    excludes              TEXT NOT NULL DEFAULT '[]', -- json array
+    retention_policy      TEXT NOT NULL DEFAULT '{}', -- json object: keep_last, keep_hourly, …
+    retry_max             INTEGER NOT NULL DEFAULT 3,  -- bounds the pending_runs retry loop
+    retry_backoff_seconds INTEGER NOT NULL DEFAULT 60, -- bounds the pending_runs retry loop
+    -- conflict_dimension is the cached name of the failing keep-* on
+    -- a granularity↔cadence mismatch (e.g. "hourly" when keep-hourly
+    -- is set but no schedule pointing at this group fires sub-daily).
+    -- NULL means no conflict. Refreshed on every schedule + group CRUD.
+    conflict_dimension    TEXT,
+    created_at            TEXT NOT NULL,
+    updated_at            TEXT NOT NULL,
+    -- Group names are unique per host so the snapshot-tag → group
+    -- mapping is unambiguous. Names can repeat across hosts.
+    UNIQUE (host_id, name)
+);
+CREATE INDEX source_groups_host_id ON source_groups(host_id);
+
+-- ----- 4. Schedule ↔ source group junction -------------------------------
+-- N:M. A schedule can point at multiple groups (one tick → N backups);
+-- a group can be referenced by multiple schedules (rapid hourly +
+-- daily checkpoint). ON DELETE CASCADE on either side prunes the
+-- junction row when its parent goes.
+
+CREATE TABLE schedule_source_groups (
+    schedule_id     TEXT NOT NULL REFERENCES schedules(id) ON DELETE CASCADE,
+    source_group_id TEXT NOT NULL REFERENCES source_groups(id) ON DELETE CASCADE,
+    PRIMARY KEY (schedule_id, source_group_id)
+);
+CREATE INDEX schedule_source_groups_group_id
+    ON schedule_source_groups(source_group_id);
+
+-- ----- 5. Host repo maintenance -----------------------------------------
+-- forget / prune / check are repo-level operations (1:1 with host's
+-- repo, not per source-group). One row per host with sensible
+-- defaults; the agent runs each on its cron cadence.
+--
+-- forget runs per source-group internally: agent walks every
+-- group on the host (source_groups has no "enabled" flag) and runs
+-- `restic forget --tag <group-name> --keep-* …` per its retention policy.
+--
+-- prune is heavy — weekly. check is monthly with --read-data-subset.
+-- NOTE(review): 5% a month reads at most 60% a year — not everything.
+
+CREATE TABLE host_repo_maintenance (
+    host_id          TEXT PRIMARY KEY REFERENCES hosts(id) ON DELETE CASCADE,
+    forget_cron      TEXT NOT NULL DEFAULT '0 3 * * *',  -- daily at 03:00
+    forget_enabled   INTEGER NOT NULL DEFAULT 1,         -- 0/1 flag
+    prune_cron       TEXT NOT NULL DEFAULT '0 4 * * 0',  -- Sundays at 04:00
+    prune_enabled    INTEGER NOT NULL DEFAULT 1,         -- 0/1 flag
+    check_cron       TEXT NOT NULL DEFAULT '0 5 1 * *',  -- 1st of the month at 05:00
+    check_enabled    INTEGER NOT NULL DEFAULT 1,         -- 0/1 flag
+    check_subset_pct INTEGER NOT NULL DEFAULT 5          -- percent for --read-data-subset
+);
+
+-- ----- 6. Pending runs (offline retry queue) ----------------------------
+-- When the agent is offline at fire time, the server schedules a
+-- retry instead of dropping the tick. Per source group: the group's
+-- retry_max + retry_backoff_seconds bound the loop. Server-side
+-- ticker polls due rows every ~30s and dispatches if Hub.Connected.
+-- Cleared on successful dispatch or once attempt >= retry_max.
+
+CREATE TABLE pending_runs (
+    id              TEXT PRIMARY KEY,
+    schedule_id     TEXT NOT NULL REFERENCES schedules(id) ON DELETE CASCADE,
+    source_group_id TEXT NOT NULL REFERENCES source_groups(id) ON DELETE CASCADE,
+    host_id         TEXT NOT NULL REFERENCES hosts(id) ON DELETE CASCADE,
+    attempt         INTEGER NOT NULL DEFAULT 1,
+    next_attempt_at TEXT NOT NULL,
+    scheduled_at    TEXT NOT NULL, -- original tick time, for forensic logging
+    last_error      TEXT
+);
+CREATE INDEX pending_runs_due ON pending_runs(next_attempt_at);
+CREATE INDEX pending_runs_host_id ON pending_runs(host_id);
+
+-- ----- 7. Bandwidth caps on hosts (host-wide, not per-group) -------------
+
+ALTER TABLE hosts ADD COLUMN bandwidth_up_kbps INTEGER;    -- nullable; NOTE(review): NULL presumably = no cap — confirm
+ALTER TABLE hosts ADD COLUMN bandwidth_down_kbps INTEGER;  -- nullable; NOTE(review): NULL presumably = no cap — confirm
+
+-- ----- 8. Drop host.repo_initialised_at ---------------------------------
+-- Auto-init on host enrol makes this derivable from the latest init
+-- job's status. The Init-repo button (red affordance) goes too;
+-- failure is surfaced by a job-history banner, not a button.
+
+ALTER TABLE hosts DROP COLUMN repo_initialised_at;  -- DROP COLUMN needs SQLite >= 3.35.0
+
+-- ----- 9. host_schedule_version stays --------------------------------
+-- Still load-bearing: bumped on any source_group / schedule / junction
+-- CRUD. Pushed to the agent in the schedule.set payload alongside
+-- inline groups, ack'd in schedule.ack. Kept as-is.
+
+PRAGMA foreign_keys = ON;