Files
restic-manager/internal/server/http/repo_ops.go
T
steve a45c801884 feat(alerts): per-source-group dedup so two failing backups produce two alerts
Until now the open-alert key was (host_id, kind, resolved_at IS NULL).
A host with two source groups both failing collapsed onto one
backup_failed row — the second failure bumped last_seen_at and
overwrote the message but never re-fanned out. Operators saw one
alert that appeared to flap, not two distinct broken things.

Schema changes (column-level ALTER, no rebuild):

- 0015 jobs.source_group_id (FK → source_groups, ON DELETE SET NULL,
  index). Populated for backup jobs in CreateJob.
- 0016 alerts.dedup_key (NOT NULL DEFAULT ''). The old alerts_open
  partial index gets dropped and replaced with a UNIQUE partial
  index on (host_id, kind, dedup_key) WHERE resolved_at IS NULL —
  the index is now the actual dedup primitive.

Plumbing:

- RaiseOrTouch / AutoResolve / Alert struct gain dedup_key.
- engine.JobFinishedEvent gains SourceGroupID; handleJobFinished
  passes it through for backup_failed only (forget/prune/check stay
  repo-scoped with key='').
- ws.handler reads SourceGroupID off the freshly-loaded job row.
- dispatchJobWithPayload gains a *string sourceGroupID arg; the
  per-group Run-now path and schedule.fire path pass &g.ID.

Test coverage: TestRaiseOrTouchDedupsPerSourceGroup proves two
distinct groups produce two distinct open alerts and that resolving
one does not auto-resolve the other.

Dev tool: cmd/_fake_alert gains -dedup-key flag.
2026-05-04 22:59:48 +01:00

166 lines
5.2 KiB
Go

// repo_ops.go — operator-triggered Run-now for repo-level operations:
// prune, check, unlock. Backed by the same dispatchJobWithPayload
// pipeline as backup, with an extra step for prune: push admin creds
// first if they're set, refuse loudly if they aren't.
package http
import (
"errors"
"log/slog"
stdhttp "net/http"
"strconv"
"github.com/go-chi/chi/v5"
"gitea.dcglab.co.uk/steve/restic-manager/internal/api"
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
)
// handleRunRepoPrune — POST /api/hosts/{id}/repo/prune (and the HTMX
// twin outside /api). Operator-triggered prune, in three steps:
//
//  1. Authenticate. Unauthenticated HTML callers are redirected to
//     /login; everyone else gets a 401 JSON envelope.
//  2. Push the host's admin credentials to the agent. store.ErrNotFound
//     (the operator has not set them yet) becomes 400
//     admin_creds_required; any other failure is logged and reported as
//     503 host_offline.
//  3. Dispatch a prune command.run with RequiresAdminCreds=true and hand
//     the outcome to runOpRedirect / runOpError.
func (s *Server) handleRunRepoPrune(w stdhttp.ResponseWriter, r *stdhttp.Request) {
	user, authed := s.requireUser(r)
	if !authed {
		if !wantsHTML(r) {
			writeJSONError(w, stdhttp.StatusUnauthorized, "unauthorised", "")
			return
		}
		stdhttp.Redirect(w, r, "/login", stdhttp.StatusSeeOther)
		return
	}

	id := chi.URLParam(r, "id")
	if len(id) == 0 {
		s.runOpError(w, r, stdhttp.StatusBadRequest, "missing_id", "")
		return
	}

	ctx := r.Context()

	// Admin creds go down before the prune itself; without them the
	// dispatch is pointless, so every push failure ends the request.
	pushErr := s.pushAdminCredsToAgent(ctx, id)
	switch {
	case pushErr == nil:
		// Creds delivered; carry on to the dispatch below.
	case errors.Is(pushErr, store.ErrNotFound):
		// Nothing stored for this host: the operator must configure
		// admin credentials first.
		s.runOpError(w, r, stdhttp.StatusBadRequest, "admin_creds_required",
			"set admin credentials on the Repo page before running prune")
		return
	default:
		// Send failure (agent offline) or decrypt failure. The operator
		// gets a generic "offline" message; the real cause is in the log.
		slog.Warn("prune: push admin creds failed", "host_id", id, "err", pushErr)
		s.runOpError(w, r, stdhttp.StatusServiceUnavailable, "host_offline",
			"agent is not currently connected; try again when it reconnects")
		return
	}

	payload := api.CommandRunPayload{RequiresAdminCreds: true}
	job, httpStatus, errCode, errMsg := s.dispatchJobWithPayload(ctx, user, id, api.JobPrune, nil, payload)
	if errCode != "" {
		s.runOpError(w, r, httpStatus, errCode, errMsg)
		return
	}
	s.runOpRedirect(w, r, job)
}
// handleRunRepoCheck — POST /api/hosts/{id}/repo/check. Dispatches a
// check job whose CommandRunPayload.Args[0] carries the subset
// percentage as a decimal string.
//
// The percentage defaults to CheckSubsetPct from the host's
// repo-maintenance row. The operator can override it with a ?subset=N
// query parameter, which is clamped to 0..100; a non-numeric value is
// ignored and the stored default is used.
//
// Failure responses:
//   - redirect to /login (HTML) or 401 unauthorised (JSON) when the
//     caller is not authenticated
//   - 400 missing_id when the {id} URL parameter is empty
//   - 500 no_maintenance_row when the store reports ErrNotFound
//   - 500 internal for any other store error (cause is logged)
//   - whatever status/code dispatchJobWithPayload reports
func (s *Server) handleRunRepoCheck(w stdhttp.ResponseWriter, r *stdhttp.Request) {
	user, ok := s.requireUser(r)
	if !ok {
		if wantsHTML(r) {
			stdhttp.Redirect(w, r, "/login", stdhttp.StatusSeeOther)
			return
		}
		writeJSONError(w, stdhttp.StatusUnauthorized, "unauthorised", "")
		return
	}
	hostID := chi.URLParam(r, "id")
	if hostID == "" {
		s.runOpError(w, r, stdhttp.StatusBadRequest, "missing_id", "")
		return
	}
	m, err := s.deps.Store.GetRepoMaintenance(r.Context(), hostID)
	if err != nil {
		if errors.Is(err, store.ErrNotFound) {
			// Maintenance row should auto-seed at enrollment. If it's
			// missing, surface a clear error rather than guessing 0%.
			s.runOpError(w, r, stdhttp.StatusInternalServerError, "no_maintenance_row",
				"host has no repo-maintenance config; was the host fully enrolled?")
			return
		}
		// Unexpected store failure: the client only receives an opaque
		// "internal" code, so record the cause here or it is lost.
		slog.Error("check: load repo maintenance failed", "host_id", hostID, "err", err)
		s.runOpError(w, r, stdhttp.StatusInternalServerError, "internal", "")
		return
	}
	subset := m.CheckSubsetPct
	if q := r.URL.Query().Get("subset"); q != "" {
		// Non-numeric ?subset silently falls back to DB value.
		if n, convErr := strconv.Atoi(q); convErr == nil {
			// Clamp the operator override into 0..100.
			subset = min(max(n, 0), 100)
		}
	}
	res, status, code, msg := s.dispatchJobWithPayload(r.Context(), user, hostID, api.JobCheck, nil,
		api.CommandRunPayload{Args: []string{strconv.Itoa(subset)}})
	if code != "" {
		s.runOpError(w, r, status, code, msg)
		return
	}
	s.runOpRedirect(w, r, res)
}
// handleRunRepoUnlock — POST /api/hosts/{id}/repo/unlock. Dispatches an
// unlock job with an empty payload. No admin creds required — restic
// unlock works with the everyday user.
//
// Unauthenticated HTML callers are redirected to /login, other
// unauthenticated callers get a 401 JSON envelope, and an empty {id}
// yields 400 missing_id.
func (s *Server) handleRunRepoUnlock(w stdhttp.ResponseWriter, r *stdhttp.Request) {
	user, authed := s.requireUser(r)
	if !authed {
		if !wantsHTML(r) {
			writeJSONError(w, stdhttp.StatusUnauthorized, "unauthorised", "")
			return
		}
		stdhttp.Redirect(w, r, "/login", stdhttp.StatusSeeOther)
		return
	}

	id := chi.URLParam(r, "id")
	if len(id) == 0 {
		s.runOpError(w, r, stdhttp.StatusBadRequest, "missing_id", "")
		return
	}

	var payload api.CommandRunPayload // zero value: unlock needs no args
	job, httpStatus, errCode, errMsg := s.dispatchJobWithPayload(r.Context(), user, id, api.JobUnlock, nil, payload)
	if errCode != "" {
		s.runOpError(w, r, httpStatus, errCode, errMsg)
		return
	}
	s.runOpRedirect(w, r, job)
}
// runOpRedirect reports a successfully dispatched job to the caller.
// JSON clients get 202 Accepted with res as the body; HTML (HTMX)
// clients get 204 No Content plus an HX-Redirect header pointing at
// /jobs/{id}. Mirrors handleRunSourceGroup's tail.
func (s *Server) runOpRedirect(w stdhttp.ResponseWriter, r *stdhttp.Request, res runNowResponse) {
	if !wantsHTML(r) {
		writeJSON(w, stdhttp.StatusAccepted, res)
		return
	}
	// The header must be set before WriteHeader sends the response.
	jobURL := "/jobs/" + res.JobID
	w.Header().Set("HX-Redirect", jobURL)
	w.WriteHeader(stdhttp.StatusNoContent)
}
// runOpError reports a failed repo operation in the format the caller
// expects. JSON clients get the standard error envelope built from
// status, code and msg; HTML (HTMX) clients get msg as a plain-text
// body with the same status (code is not sent on that path).
// Mirrors runGroupError.
func (s *Server) runOpError(w stdhttp.ResponseWriter, r *stdhttp.Request, status int, code, msg string) {
	if !wantsHTML(r) {
		writeJSONError(w, status, code, msg)
		return
	}
	stdhttp.Error(w, msg, status)
}