Files
restic-manager/internal/server/http/run_group.go
T
steve a45c801884 feat(alerts): per-source-group dedup so two failing backups produce two alerts
Until now the open-alert key was (host_id, kind, resolved_at IS NULL).
A host with two source groups both failing collapsed onto one
backup_failed row — the second failure bumped last_seen_at and
overwrote the message but never triggered a fresh fan-out. Operators
saw one alert that appeared to flap, not two distinct broken things.

Schema changes (column-level ALTER, no rebuild):

- 0015 jobs.source_group_id (FK → source_groups, ON DELETE SET NULL,
  index). Populated for backup jobs in CreateJob.
- 0016 alerts.dedup_key (NOT NULL DEFAULT ''). The old alerts_open
  partial index gets dropped and replaced with a UNIQUE partial
  index on (host_id, kind, dedup_key) WHERE resolved_at IS NULL —
  the index is now the actual dedup primitive.

Plumbing:

- RaiseOrTouch / AutoResolve / Alert struct gain dedup_key.
- engine.JobFinishedEvent gains SourceGroupID; handleJobFinished
  passes it through for backup_failed only (forget/prune/check stay
  repo-scoped with key='').
- ws.handler reads SourceGroupID off the freshly-loaded job row.
- dispatchJobWithPayload gains a *string sourceGroupID arg; the
  per-group Run-now path and schedule.fire path pass &g.ID.

Test coverage: TestRaiseOrTouchDedupsPerSourceGroup proves two
distinct groups produce two distinct open alerts and that resolving
one does not auto-resolve the other.

Dev tool: cmd/_fake_alert gains -dedup-key flag.
2026-05-04 22:59:48 +01:00

133 lines
4.3 KiB
Go

// run_group.go — per-source-group Run-now endpoint.
//
// POST /hosts/{id}/source-groups/{gid}/run dispatches a backup job
// against the resolved includes/excludes/retention/tag of the named
// group. Replaces the old per-host /hosts/{id}/run-backup route (now
// 410 Gone).
package http
import (
"errors"
stdhttp "net/http"
"strconv"
"github.com/go-chi/chi/v5"
"gitea.dcglab.co.uk/steve/restic-manager/internal/api"
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
)
// parseBandwidthOverride reads the optional bandwidth_up_kbps and
// bandwidth_down_kbps values from the request via r.FormValue (form or
// query).
//
// An absent or empty field yields a nil pointer. An explicit "0" yields
// a non-nil pointer to 0 — i.e., "no cap for this run, even if the host
// has one set." A value that is not an integer, or is negative, is
// rejected with an error that names the offending field.
//
// On error both pointers are nil regardless of which field failed, so a
// caller never observes a half-parsed override.
func parseBandwidthOverride(r *stdhttp.Request) (up *int, down *int, err error) {
	// parse converts one named field; (nil, nil) means "not supplied".
	parse := func(name string) (*int, error) {
		v := r.FormValue(name)
		if v == "" {
			return nil, nil
		}
		n, perr := strconv.Atoi(v)
		if perr != nil {
			return nil, errors.New(name + " must be an integer")
		}
		if n < 0 {
			return nil, errors.New(name + " must be >= 0")
		}
		return &n, nil
	}

	if up, err = parse("bandwidth_up_kbps"); err != nil {
		return nil, nil, err
	}
	if down, err = parse("bandwidth_down_kbps"); err != nil {
		// Drop the already-parsed up value: results are meaningless
		// alongside a non-nil error.
		return nil, nil, err
	}
	return up, down, nil
}
// handleRunSourceGroup serves the per-source-group Run-now action
// (POST /hosts/{id}/source-groups/{gid}/run). It looks the group up on
// the host, applies any per-run bandwidth override, resolves the backup
// hooks and dispatches a backup job. HTMX callers are pointed at the
// live job log via HX-Redirect; other callers receive 202 Accepted with
// the dispatch result as JSON.
func (s *Server) handleRunSourceGroup(w stdhttp.ResponseWriter, r *stdhttp.Request) {
	user, authed := s.requireUser(r)
	if !authed {
		// JSON callers get a 401 envelope; HTML callers are sent to login.
		if !wantsHTML(r) {
			writeJSONError(w, stdhttp.StatusUnauthorized, "unauthorised", "")
			return
		}
		stdhttp.Redirect(w, r, "/login", stdhttp.StatusSeeOther)
		return
	}

	ctx := r.Context()
	hostID, groupID := chi.URLParam(r, "id"), chi.URLParam(r, "gid")

	group, err := s.deps.Store.GetSourceGroup(ctx, hostID, groupID)
	if err != nil {
		// Anything other than "not found" is reported as an opaque 500.
		status, code, msg := stdhttp.StatusInternalServerError, "internal", ""
		if errors.Is(err, store.ErrNotFound) {
			status, code, msg = stdhttp.StatusNotFound, "group_not_found",
				"source group not found on this host"
		}
		s.runGroupError(w, r, status, code, msg)
		return
	}

	// The bandwidth override is optional. The UI exposes it under a
	// <details> "Limit bandwidth for this run" affordance; when it is
	// missing from the request (including JSON callers that omit it)
	// the host's standing caps apply.
	upKBps, downKBps, bwErr := parseBandwidthOverride(r)
	if bwErr != nil {
		s.runGroupError(w, r, stdhttp.StatusBadRequest, "invalid_value", bwErr.Error())
		return
	}

	// Hooks resolve group → host default → empty. The host lookup is
	// best-effort: if it fails the run goes ahead without hooks rather
	// than being blocked.
	var preHook, postHook string
	host, hostErr := s.deps.Store.GetHost(ctx, hostID)
	if hostErr == nil {
		preHook, postHook = s.resolveBackupHooks(host, group)
	}

	// RetentionPolicy is deliberately left out: backups don't consume
	// it (forget does), so sending it would only be dead weight.
	payload := api.CommandRunPayload{
		Includes:          group.Includes,
		Excludes:          group.Excludes,
		Tag:               group.Name,
		BandwidthUpKBps:   upKBps,
		BandwidthDownKBps: downKBps,
		PreHook:           preHook,
		PostHook:          postHook,
	}
	res, status, code, msg := s.dispatchJobWithPayload(ctx, user, hostID, api.JobBackup, &group.ID, payload)
	if code != "" {
		s.runGroupError(w, r, status, code, msg)
		return
	}

	if !wantsHTML(r) {
		writeJSON(w, stdhttp.StatusAccepted, res)
		return
	}
	// HTMX action: send the operator straight to the live job log so
	// streaming output is visible immediately.
	w.Header().Set("HX-Redirect", "/jobs/"+res.JobID)
	w.WriteHeader(stdhttp.StatusNoContent)
}
// runGroupError reports a failure in the shape the caller expects.
// JSON callers receive the standard error envelope via writeJSONError.
// HTMX callers (per wantsHTML) receive the given status with a
// plain-text body so the browser surfaces it via the existing toast
// handler.
//
// Some call sites pass an empty msg (the opaque "internal" case). For
// HTMX callers the body then falls back to the standard status text, so
// the toast is never blank; the JSON envelope is passed msg unchanged.
func (s *Server) runGroupError(w stdhttp.ResponseWriter, r *stdhttp.Request, status int, code, msg string) {
	if !wantsHTML(r) {
		writeJSONError(w, status, code, msg)
		return
	}
	body := msg
	if body == "" {
		body = stdhttp.StatusText(status)
	}
	stdhttp.Error(w, body, status)
}
// wantsHTML reports whether the request should be answered in the HTMX
// shape, judged solely by an HX-Request header equal to "true". Anything
// else — a browser's default Accept, curl's `*/*` — is treated as a JSON
// client, which is the safer default for non-htmx callers. HTMX always
// sets HX-Request=true on its action POSTs, so the form path is
// unambiguous.
func wantsHTML(r *stdhttp.Request) bool {
	switch r.Header.Get("HX-Request") {
	case "true":
		return true
	default:
		return false
	}
}