Files
restic-manager/internal/server/http/jobs.go
T
steve 350be3f19d feat(alerts): per-source-group dedup so two failing backups produce two alerts
Until now the open-alert key was (host_id, kind, resolved_at IS NULL).
A host with two failing source groups collapsed onto one
backup_failed row — the second failure bumped last_seen_at and
overwrote the message but never fanned out again. Operators saw one
alert that appeared to flap, not two distinct broken things.

Schema changes (column-level ALTER, no rebuild):

- 0015 jobs.source_group_id (FK → source_groups, ON DELETE SET NULL,
  index). Populated for backup jobs in CreateJob.
- 0016 alerts.dedup_key (NOT NULL DEFAULT ''). The old alerts_open
  partial index gets dropped and replaced with a UNIQUE partial
  index on (host_id, kind, dedup_key) WHERE resolved_at IS NULL —
  the index is now the actual dedup primitive.

Plumbing:

- RaiseOrTouch / AutoResolve / Alert struct gain dedup_key.
- engine.JobFinishedEvent gains SourceGroupID; handleJobFinished
  passes it through for backup_failed only (forget/prune/check stay
  repo-scoped with key='').
- ws.handler reads SourceGroupID off the freshly-loaded job row.
- dispatchJobWithPayload gains a *string sourceGroupID arg; the
  per-group Run-now path and schedule.fire path pass &g.ID.

Test coverage: TestRaiseOrTouchDedupsPerSourceGroup proves two
distinct groups produce two distinct open alerts and that resolving
one does not auto-resolve the other.

Dev tool: cmd/_fake_alert gains -dedup-key flag.
2026-05-04 22:59:48 +01:00

166 lines
5.1 KiB
Go

package http
import (
"context"
"encoding/json"
stdhttp "net/http"
"time"
"github.com/go-chi/chi/v5"
"github.com/oklog/ulid/v2"
"gitea.dcglab.co.uk/steve/restic-manager/internal/api"
"gitea.dcglab.co.uk/steve/restic-manager/internal/auth"
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
)
// runNowRequest is the body of POST /api/hosts/:id/jobs.
// handleRunNow decodes it from JSON and passes Kind and Args on to
// dispatchJob.
type runNowRequest struct {
// Kind selects the job to run. Dispatch rejects it with "invalid_kind"
// unless validJobKind accepts it.
Kind api.JobKind `json:"kind"`
// Args is optional and is forwarded unchanged as the Args field of the
// api.CommandRunPayload sent to the agent.
Args []string `json:"args,omitempty"` // restic CLI args (paths for backup, etc.)
}
// runNowResponse is the success value returned by dispatchJob and
// dispatchJobWithPayload; handleRunNow writes it as the JSON body of its
// 202 Accepted reply.
type runNowResponse struct {
// JobID is the ULID string generated for the newly created job row.
JobID string `json:"job_id"`
// Status is set to "queued" by dispatchJobWithPayload on success.
Status string `json:"status"` // "queued"
}
// handleRunNow serves POST /api/hosts/:id/jobs: it dispatches a job to
// the host named by the "id" URL parameter.
//
// Responses:
//   - 401 "unauthorised" when requireUser cannot resolve a user.
//   - 400 "missing_host_id" when the URL parameter is empty.
//   - 400 "invalid_json" when the body cannot be decoded as a
//     runNowRequest, including when it exceeds the size cap below.
//   - whatever status/code/msg dispatchJob reports on failure (for
//     example "host_offline" when the agent is not connected; the caller
//     should retry once the agent comes back).
//   - 202 Accepted with a runNowResponse on success.
func (s *Server) handleRunNow(w stdhttp.ResponseWriter, r *stdhttp.Request) {
	// Cap on the request body. The payload is a kind plus a list of CLI
	// args, so 1 MiB is generous; without a cap a client could stream an
	// arbitrarily large body into the decoder.
	const maxRunNowBody = 1 << 20

	user, ok := s.requireUser(r)
	if !ok {
		writeJSONError(w, stdhttp.StatusUnauthorized, "unauthorised", "")
		return
	}

	hostID := chi.URLParam(r, "id")
	if hostID == "" {
		writeJSONError(w, stdhttp.StatusBadRequest, "missing_host_id", "")
		return
	}

	var req runNowRequest
	body := stdhttp.MaxBytesReader(w, r.Body, maxRunNowBody)
	if err := json.NewDecoder(body).Decode(&req); err != nil {
		writeJSONError(w, stdhttp.StatusBadRequest, "invalid_json", err.Error())
		return
	}

	// A non-empty code signals failure; status is only meaningful then.
	res, status, code, msg := s.dispatchJob(r.Context(), user, hostID, req.Kind, req.Args)
	if code != "" {
		writeJSONError(w, status, code, msg)
		return
	}
	writeJSON(w, stdhttp.StatusAccepted, res)
}
// dispatchJob is the common entry point for HTTP-driven job dispatch
// when the caller only has a kind and raw restic CLI args. It builds an
// api.CommandRunPayload carrying just Kind and Args and delegates to
// dispatchJobWithPayload with no source group, which validates the kind,
// checks the host is online, persists the job row, and ships the command
// to the agent.
//
// Returns:
//   - res: the queued-job response (job_id + status)
//   - status: HTTP status to return on failure (or 0 on success)
//   - code, msg: error code/message for the wire (empty on success)
//
// JSON callers wrap failures with writeJSONError; HTML callers translate
// them to a flash banner + redirect.
func (s *Server) dispatchJob(ctx context.Context, user *store.User,
	hostID string, kind api.JobKind, args []string,
) (res runNowResponse, status int, code, msg string) {
	payload := api.CommandRunPayload{Kind: kind, Args: args}

	// This path never associates the job with a source group.
	var noSourceGroup *string

	return s.dispatchJobWithPayload(ctx, user, hostID, kind, noSourceGroup, payload)
}
// dispatchJobWithPayload is the variant of dispatchJob for callers that
// need to fill in structured payload fields
// (Includes/Excludes/Tag/ForgetGroups/RequiresAdminCreds), such as the
// per-source-group Run-now path. The payload's JobID and Kind are
// overwritten here, so callers may leave them zero.
//
// sourceGroupID is stored on the job row; it is the dedup key the alert
// engine keys on for backup_failed. Pass non-nil for backups and nil for
// prune/check/unlock (those are repo-scoped and dedup at host_id only).
//
// A nil user records the job and the audit entry with actor "system" and
// no actor ID; otherwise the actor is "user" with the user's ID.
//
// Steps, in order: validate kind, load the host, require a connected
// agent, create the job row, marshal and send the api.MsgCommandRun
// envelope, append a "job.run_now" audit entry.
//
// Returns the same (res, status, code, msg) tuple as dispatchJob: on
// failure res is zero and status/code/msg describe the error; on success
// status is 0 and code/msg are empty.
//
// NOTE(review): the invalid_kind message lists five kinds, but
// validJobKind also accepts init, restore and diff — confirm whether the
// message should be extended.
// NOTE(review): if marshalling or sending fails after CreateJob, the job
// row stays behind; nothing in this function marks it failed — confirm
// whether something else reaps it.
func (s *Server) dispatchJobWithPayload(ctx context.Context, user *store.User,
	hostID string, kind api.JobKind, sourceGroupID *string, payload api.CommandRunPayload,
) (res runNowResponse, status int, code, msg string) {
	if !validJobKind(kind) {
		return res, stdhttp.StatusBadRequest, "invalid_kind",
			"kind must be one of backup|forget|prune|check|unlock"
	}

	// Any lookup error is reported to the caller as "host not found".
	target, err := s.deps.Store.GetHost(ctx, hostID)
	if err != nil {
		return res, stdhttp.StatusNotFound, "host_not_found", ""
	}
	if online := s.deps.Hub.Connected(target.ID); !online {
		return res, stdhttp.StatusServiceUnavailable, "host_offline",
			"agent is not currently connected; try again when it reconnects"
	}

	newJobID := ulid.Make().String()
	createdAt := time.Now().UTC()

	// Attribute the job to the requesting user when there is one.
	actorKind, actorRef := "system", (*string)(nil)
	if user != nil {
		actorKind, actorRef = "user", &user.ID
	}

	record := store.Job{
		ID:            newJobID,
		HostID:        target.ID,
		Kind:          string(kind),
		SourceGroupID: sourceGroupID,
		ActorKind:     actorKind,
		ActorID:       actorRef,
		CreatedAt:     createdAt,
	}
	if createErr := s.deps.Store.CreateJob(ctx, record); createErr != nil {
		return res, stdhttp.StatusInternalServerError, "internal", ""
	}

	// The server owns these two fields regardless of what the caller set.
	payload.JobID, payload.Kind = newJobID, kind

	envelope, err := api.Marshal(api.MsgCommandRun, newJobID, payload)
	if err != nil {
		return res, stdhttp.StatusInternalServerError, "internal", ""
	}
	if sendErr := s.deps.Hub.Send(ctx, target.ID, envelope); sendErr != nil {
		return res, stdhttp.StatusServiceUnavailable, "host_offline", sendErr.Error()
	}

	// The audit write is best-effort: its error is discarded and does not
	// fail a dispatch that has already been sent to the agent.
	auditEntry := store.AuditEntry{
		ID:         ulid.Make().String(),
		UserID:     actorRef,
		Actor:      actorKind,
		Action:     "job.run_now",
		TargetKind: ptr("job"),
		TargetID:   &newJobID,
		TS:         createdAt,
	}
	_ = s.deps.Store.AppendAudit(ctx, auditEntry)

	res = runNowResponse{JobID: newJobID, Status: "queued"}
	return res, 0, "", ""
}
// requireUser resolves the request's session cookie to a user row. It
// reads the sessionCookieName cookie, looks up the session by the hash
// of the cookie value, then loads the user the session belongs to.
//
// It returns (user, true) on success and (nil, false) if the cookie is
// missing or either lookup fails. This is a stub of the session-auth
// middleware that lands in P1-04's full pass.
func (s *Server) requireUser(r *stdhttp.Request) (*store.User, bool) {
	cookie, err := r.Cookie(sessionCookieName)
	if err != nil {
		return nil, false
	}

	ctx := r.Context()

	// Sessions are looked up by the hashed token, not the raw cookie value.
	session, err := s.deps.Store.LookupSession(ctx, auth.HashToken(cookie.Value))
	if err != nil {
		return nil, false
	}

	account, err := s.deps.Store.GetUserByID(ctx, session.UserID)
	if err != nil {
		return nil, false
	}
	return account, true
}
// validJobKind reports whether k is one of the job kinds the dispatch
// path accepts: backup, init, forget, prune, check, unlock, restore or
// diff. Any other value, including the zero value, is rejected.
func validJobKind(k api.JobKind) bool {
	switch k {
	case api.JobBackup, api.JobInit, api.JobForget, api.JobPrune:
	case api.JobCheck, api.JobUnlock, api.JobRestore, api.JobDiff:
	default:
		return false
	}
	return true
}