Files
restic-manager/internal/server/http/maintenance_dispatch.go
steve 350be3f19d feat(alerts): per-source-group dedup so two failing backups produce two alerts
Until now the open-alert key was (host_id, kind, resolved_at IS NULL).
A host with two source groups both failing collapsed onto one
backup_failed row — the second failure bumped last_seen_at and
overwrote the message but never triggered a new fan-out. Operators saw one
alert that appeared to flap, not two distinct broken things.

Schema changes (column-level ALTER, no rebuild):

- 0015 jobs.source_group_id (FK → source_groups, ON DELETE SET NULL,
  index). Populated for backup jobs in CreateJob.
- 0016 alerts.dedup_key (NOT NULL DEFAULT ''). The old alerts_open
  partial index gets dropped and replaced with a UNIQUE partial
  index on (host_id, kind, dedup_key) WHERE resolved_at IS NULL —
  the index is now the actual dedup primitive.

Plumbing:

- RaiseOrTouch / AutoResolve / Alert struct gain dedup_key.
- engine.JobFinishedEvent gains SourceGroupID; handleJobFinished
  passes it through for backup_failed only (forget/prune/check stay
  repo-scoped with key='').
- ws.handler reads SourceGroupID off the freshly-loaded job row.
- dispatchJobWithPayload gains a *string sourceGroupID arg; the
  per-group Run-now path and schedule.fire path pass &g.ID.

Test coverage: TestRaiseOrTouchDedupsPerSourceGroup proves two
distinct groups produce two distinct open alerts and that resolving
one does not auto-resolve the other.

Dev tool: cmd/_fake_alert gains -dedup-key flag.
2026-05-04 22:59:48 +01:00

133 lines
4.9 KiB
Go

// maintenance_dispatch.go bridges the pure-logic maintenance.Ticker
// (internal/server/maintenance) to the side-effecting world: checks
// online state, builds the per-kind command.run payload, and calls
// dispatchJobWithPayload — the same path operator-triggered Run-now
// uses. Cadence-driven jobs are persisted with actor_kind="system"
// (dispatchJobWithPayload tags it that way when user==nil).
//
// Maintenance fires deliberately do NOT queue to pending_runs when
// the host is offline — five missed prunes on a laptop returning
// from a week away is not what the operator wants. Skip + log; the
// next 60s tick will re-evaluate.
package http
import (
"context"
"errors"
"log/slog"
"strconv"
"gitea.dcglab.co.uk/steve/restic-manager/internal/api"
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/maintenance"
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
)
// DispatchMaintenance acts on each Decision from the ticker, in order.
// Nothing is returned to the caller; every outcome other than a
// successful dispatch is reported through slog.
//
// A decision is skipped (and logged) when:
//   - ctx is already done: the remaining decisions are abandoned and
//     the function returns;
//   - the host is not connected to the hub;
//   - kind is "forget" and buildForgetPayloadForHost reports ok=false;
//   - kind is "prune" and the host has no admin credentials (info
//     level — the operator hasn't completed the admin-creds setup
//     yet), the credential lookup fails for another reason, or the
//     credentials cannot be pushed to the agent (both warn level).
//     Operator-triggered prune via the run-now endpoint returns a
//     clear error instead — different path, different UX;
//   - kind is not one of "forget", "prune", "check".
//
// Jobs are dispatched through dispatchJobWithPayload with a nil user.
func (s *Server) DispatchMaintenance(ctx context.Context, decisions []maintenance.Decision) {
	for i, d := range decisions {
		// Stop once the caller has given up (e.g. shutdown) instead of
		// issuing further store/agent calls for every remaining decision.
		if err := ctx.Err(); err != nil {
			slog.Info("maintenance: dispatch aborted, context done",
				"remaining", len(decisions)-i, "err", err)
			return
		}
		if !s.deps.Hub.Connected(d.HostID) {
			slog.Info("maintenance: host offline, skipping",
				"host_id", d.HostID, "kind", d.Kind)
			continue
		}
		switch d.Kind {
		case "forget":
			payload, ok := s.buildForgetPayloadForHost(ctx, d.HostID)
			if !ok {
				slog.Info("maintenance: forget skipped — no source groups with retention",
					"host_id", d.HostID)
				continue
			}
			// The job ID and other leading results are not needed here;
			// a non-empty code signals a failed dispatch.
			_, _, code, msg := s.dispatchJobWithPayload(ctx, nil, d.HostID, api.JobForget, nil, payload)
			if code != "" {
				slog.Warn("maintenance: forget dispatch failed",
					"host_id", d.HostID, "code", code, "msg", msg)
			}
		case "prune":
			// Only the presence of admin creds matters at this point;
			// the credentials themselves are pushed separately below.
			if _, err := s.deps.Store.GetHostCredentials(ctx, d.HostID, store.CredKindAdmin); err != nil {
				if errors.Is(err, store.ErrNotFound) {
					slog.Info("maintenance: prune skipped — no admin creds",
						"host_id", d.HostID)
					continue
				}
				slog.Warn("maintenance: prune skipped — admin creds error",
					"host_id", d.HostID, "err", err)
				continue
			}
			if err := s.pushAdminCredsToAgent(ctx, d.HostID); err != nil {
				slog.Warn("maintenance: prune push admin creds failed",
					"host_id", d.HostID, "err", err)
				continue
			}
			payload := api.CommandRunPayload{RequiresAdminCreds: true}
			_, _, code, msg := s.dispatchJobWithPayload(ctx, nil, d.HostID, api.JobPrune, nil, payload)
			if code != "" {
				slog.Warn("maintenance: prune dispatch failed",
					"host_id", d.HostID, "code", code, "msg", msg)
			}
		case "check":
			// The subset percentage travels as the single command argument.
			payload := api.CommandRunPayload{Args: []string{strconv.Itoa(d.SubsetPct)}}
			_, _, code, msg := s.dispatchJobWithPayload(ctx, nil, d.HostID, api.JobCheck, nil, payload)
			if code != "" {
				slog.Warn("maintenance: check dispatch failed",
					"host_id", d.HostID, "code", code, "msg", msg)
			}
		default:
			slog.Warn("maintenance: unknown decision kind",
				"host_id", d.HostID, "kind", d.Kind)
		}
	}
}
// buildForgetPayloadForHost assembles the forget payload for one host:
// one ForgetGroup per source group whose retention policy is not
// empty, tagged with the group's name.
//
// The second result is false — and the payload is the zero value —
// when listing the host's source groups fails (logged at warn level)
// or when no group carries a retention policy; the dispatcher then
// skips the forget for this host.
func (s *Server) buildForgetPayloadForHost(ctx context.Context, hostID string) (api.CommandRunPayload, bool) {
	sourceGroups, listErr := s.deps.Store.ListSourceGroupsByHost(ctx, hostID)
	if listErr != nil {
		slog.Warn("maintenance: list source groups failed", "host_id", hostID, "err", listErr)
		return api.CommandRunPayload{}, false
	}

	// Sized for the worst case in which every group has a policy.
	selected := make([]api.ForgetGroup, 0, len(sourceGroups))
	for _, group := range sourceGroups {
		if !isEmptyRetention(group.RetentionPolicy) {
			entry := api.ForgetGroup{
				Tag:    group.Name,
				Policy: forgetPolicyJSONFromStore(group.RetentionPolicy),
			}
			selected = append(selected, entry)
		}
	}

	if len(selected) > 0 {
		return api.CommandRunPayload{ForgetGroups: selected}, true
	}
	return api.CommandRunPayload{}, false
}
// isEmptyRetention reports whether none of the keep-* fields of p is
// set, i.e. every one of them is nil.
func isEmptyRetention(p store.RetentionPolicy) bool {
	switch {
	case p.KeepLast != nil,
		p.KeepHourly != nil,
		p.KeepDaily != nil,
		p.KeepWeekly != nil,
		p.KeepMonthly != nil,
		p.KeepYearly != nil:
		// At least one rule is present.
		return false
	}
	return true
}
// forgetPolicyJSONFromStore converts the store's retention policy into
// its wire representation by carrying each keep-* pointer across
// unchanged. The two shapes mirror each other field for field; doing
// the conversion here keeps internal/api free of an import of store,
// which would invert the dependency direction.
func forgetPolicyJSONFromStore(p store.RetentionPolicy) api.ForgetPolicyJSON {
	var wire api.ForgetPolicyJSON
	wire.KeepLast = p.KeepLast
	wire.KeepHourly = p.KeepHourly
	wire.KeepDaily = p.KeepDaily
	wire.KeepWeekly = p.KeepWeekly
	wire.KeepMonthly = p.KeepMonthly
	wire.KeepYearly = p.KeepYearly
	return wire
}