Files
restic-manager/internal/server/http/server.go
T
steve 94149a7324 P3-X1: cancel-job feature
Wires the existing job_detail Cancel button (which was a UI stub) into
real backend behaviour:

- internal/api already declared MsgCommandCancel + CommandCancelPayload;
  promote those from forward-declarations to a working envelope. Agent
  side: cmd/agent/main.go drops the TODO-stub and gains a per-job
  ctx.CancelFunc map. runJob's switch is refactored around a small
  spawn() helper so each kind's goroutine derives a per-job context,
  registers the cancel, and removes itself on completion regardless of
  outcome. command.cancel looks up the func and fires it.
- internal/agent/runner.sendFinished now takes ctx and rebadges
  ctx.Canceled errors as JobCancelled (exit 130) rather than
  JobFailed. All Run* call sites updated.
- internal/restic.resticCmd sets cmd.Cancel to send SIGTERM (via
  build-tagged sigterm constant; os.Kill on Windows since SIGTERM
  isn't deliverable there) and cmd.WaitDelay=5s for the SIGKILL
  fallback. SIGTERM lets restic remove its lock file before exiting.
- New POST /api/jobs/{id}/cancel server endpoint validates the job
  is non-terminal and the host is online, sends command.cancel via
  the hub, writes a job.cancel audit row, returns 202. The agent's
  resulting job.finished (status=cancelled) is what actually
  transitions the row.

Tests:
- internal/server/http/cancel_test.go covers happy path (envelope
  shape + audit row), 409 for terminal jobs, 404 for missing jobs,
  503 for offline hosts.
- internal/agent/runner/cancel_test.go covers cancel mid-run: a fake
  restic that exec'd into 'sleep 30' is canceled 150ms after start
  and the resulting job.finished reports JobCancelled with exit 130
  in well under the WaitDelay.

Foundational for P3 restore (operator needs to be able to cancel a
running backup if they need to restore urgently). Independently useful
for prune/check/backup that are stuck.
2026-05-04 15:11:49 +01:00

309 lines
13 KiB
Go

// Package http hosts the chi-based REST handlers for the control
// plane. The Server type owns the router, the handlers, and the
// graceful-shutdown lifecycle.
package http
import (
"context"
"errors"
stdhttp "net/http"
"sync"
"time"
"github.com/go-chi/chi/v5"
"github.com/go-chi/chi/v5/middleware"
"gitea.dcglab.co.uk/steve/restic-manager/internal/crypto"
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/config"
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/ui"
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/ws"
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
)
// Deps bundles every collaborator the HTTP server depends on. Wired up
// in cmd/server; tests pass a pared-down Deps with fakes.
//
// Hub, JobHub and UI are optional: routes skips the agent WebSocket,
// the browser job-log stream and the HTML UI respectively when the
// matching field is nil.
type Deps struct {
	// Cfg is the server configuration; Cfg.Listen is used as the
	// listen address of the underlying net/http server.
	Cfg config.Config
	// Store is the persistence layer. It is also handed to the agent
	// WebSocket handler.
	Store *store.Store
	// AEAD is the authenticated-encryption helper.
	// NOTE(review): presumably used to seal stored credentials —
	// confirm against the handlers that read it.
	AEAD *crypto.AEAD
	// Hub is the agent connection hub. When nil, /ws/agent is not
	// mounted.
	Hub *ws.Hub
	// JobHub is passed to the agent WebSocket handler and gates the
	// browser job-log stream. When nil, /api/jobs/{id}/stream is not
	// mounted.
	JobHub *ws.JobHub
	// UI renders the HTML pages. When nil, no HTML UI routes are
	// mounted.
	UI *ui.Renderer
	// Version is the binary's build version, surfaced in the chrome.
	// Empty falls back to "dev".
	Version string
	// BootstrapToken (optional, populated only on first run) is the raw
	// admin-bootstrap token printed in the server logs. While set, the
	// /bootstrap endpoint accepts it to create the first admin user.
	BootstrapToken string
}
// Server is the running HTTP server. Build one with New; the zero
// value is not usable because New initialises the map and the helper
// fields below.
type Server struct {
	// srv is the underlying net/http server, configured in New.
	srv *stdhttp.Server
	// deps holds the collaborators supplied to New.
	deps Deps
	// drainLocksMu guards the drainLocks map itself, so that looking up
	// (or creating) a host's mutex is atomic.
	drainLocksMu sync.Mutex
	// drainLocks serializes DrainPending per host. The on-hello
	// goroutine and the 30s ticker can otherwise race for the same
	// host, double-dispatching every pending row. Map of hostID →
	// sync.Mutex; checked-and-locked atomically via drainLocksMu.
	drainLocks map[string]*sync.Mutex
	// announceRL is the per-source-IP token-bucket guarding
	// POST /api/agents/announce (P2-18). One process-local map.
	announceRL *announceLimiter
	// pendingHub holds live /ws/agent/pending sockets keyed by
	// pending_id so the accept/reject handlers can push the bearer
	// or close cleanly (P2-18b).
	pendingHub *pendingHub
}
// New assembles a Server from deps without binding a socket: it builds
// the chi router, installs the middleware stack and every route, and
// prepares the underlying net/http server. Call Start to begin
// serving.
func New(deps Deps) *Server {
	s := &Server{
		deps:       deps,
		drainLocks: map[string]*sync.Mutex{},
		announceRL: newAnnounceLimiter(),
		pendingHub: newPendingHub(),
	}

	router := chi.NewRouter()
	// Middleware, in registration order: tag each request with an ID
	// for log correlation, recover from handler panics so one bad
	// request cannot crash the process, then log the request.
	router.Use(middleware.RequestID, middleware.Recoverer, requestLogger)
	// Liveness probe — unauthenticated, unaudited, deliberately cheap.
	router.Get("/healthz", func(w stdhttp.ResponseWriter, _ *stdhttp.Request) {
		w.WriteHeader(stdhttp.StatusNoContent)
	})
	s.routes(router)

	s.srv = &stdhttp.Server{
		Addr:              deps.Cfg.Listen,
		Handler:           router,
		ReadHeaderTimeout: 10 * time.Second,
		IdleTimeout:       60 * time.Second,
		// Zero disables the write timeout on purpose: WS upgrades and
		// live log streams keep their responses open indefinitely.
		WriteTimeout: 0,
	}
	return s
}
// routes wires the API tree. Subtrees live in this file by area so a
// reader can scan one place and see the surface.
//
// Layout, in registration order:
//   - /api/...            JSON REST endpoints
//   - /hosts/{id}/...     HTMX form actions that share handlers with
//     their /api twins, plus retired routes
//   - /ws/agent[/pending] agent WebSockets (main socket only if Hub is set)
//   - /agent, /install, /static  binaries, install scripts, assets
//   - HTML UI pages       only if a UI renderer is set
//   - /api/jobs/{id}/stream      only if JobHub is set
func (s *Server) routes(r chi.Router) {
	r.Route("/api", func(r chi.Router) {
		r.Post("/auth/login", s.handleLogin)
		r.Post("/auth/logout", s.handleLogout)
		r.Post("/bootstrap", s.handleBootstrap)

		// Agent enrollment (open endpoint — token is the credential).
		r.Post("/agents/enroll", s.handleAgentEnroll)

		// Announce-and-approve enrolment (open endpoint — fingerprint
		// comparison in the UI is the gate). Per-IP rate-limited and
		// globally capped (P2-18).
		r.Post("/agents/announce", s.handleAnnounce)

		// Pending host management — admin-only (gated inside the handler).
		r.Post("/pending-hosts/{id}/accept", s.handleAcceptPendingHost)
		r.Post("/pending-hosts/{id}/reject", s.handleRejectPendingHost)

		// Operator → server (authenticated). Spec.md §6.1's
		// /hosts/{id}/enrollment-token (regenerate) lands when the
		// host page can call it; for now just the create endpoint.
		r.Post("/enrollment-tokens", s.handleCreateEnrollmentToken)

		// Fleet read endpoints — back the dashboard.
		r.Get("/hosts", s.handleListHosts)
		r.Get("/fleet/summary", s.handleFleetSummary)

		// Run-now: dispatch a job to a host's agent.
		r.Post("/hosts/{id}/jobs", s.handleRunNow)

		// Snapshot projection (refreshed by the agent after each backup).
		r.Get("/hosts/{id}/snapshots", s.handleListHostSnapshots)

		// Repo credentials — operator can edit after enrollment. The
		// initial set is supplied at token-mint time (see enrollment.go).
		// GET returns a redacted view (URL, username, has_password).
		r.Get("/hosts/{id}/repo-credentials", s.handleGetHostCredentials)
		r.Put("/hosts/{id}/repo-credentials", s.handleSetHostCredentials)

		// Admin credentials — the prune-capable slot (separate from the
		// everyday repo creds). Optional: hosts that don't prune against
		// a rest-server repo with a separate admin user never need this.
		r.Get("/hosts/{id}/admin-credentials", s.handleGetAdminCredentials)
		r.Put("/hosts/{id}/admin-credentials", s.handleSetAdminCredentials)
		r.Delete("/hosts/{id}/admin-credentials", s.handleDeleteAdminCredentials)

		// Per-host schedule CRUD. Mutations bump host_schedule_version
		// and async-push to a connected agent (see schedule_push.go).
		r.Get("/hosts/{id}/schedules", s.handleListSchedules)
		r.Post("/hosts/{id}/schedules", s.handleCreateSchedule)
		r.Put("/hosts/{id}/schedules/{sid}", s.handleUpdateSchedule)
		r.Delete("/hosts/{id}/schedules/{sid}", s.handleDeleteSchedule)

		// Source-group CRUD. A group is "what gets backed up" — paths,
		// excludes, retention, retry. Group name doubles as the
		// snapshot tag (restic --tag <name>).
		r.Get("/hosts/{id}/source-groups", s.handleListSourceGroups)
		r.Post("/hosts/{id}/source-groups", s.handleCreateSourceGroup)
		r.Get("/hosts/{id}/source-groups/{gid}", s.handleGetSourceGroup)
		r.Put("/hosts/{id}/source-groups/{gid}", s.handleUpdateSourceGroup)
		r.Delete("/hosts/{id}/source-groups/{gid}", s.handleDeleteSourceGroup)

		// Repo maintenance cadences (forget / prune / check). Driven
		// by the server-side ticker (P2R-06), not the agent's cron.
		r.Get("/hosts/{id}/repo-maintenance", s.handleGetRepoMaintenance)
		r.Put("/hosts/{id}/repo-maintenance", s.handleUpdateRepoMaintenance)

		// Host-wide bandwidth caps (host.bandwidth_up_kbps /
		// bandwidth_down_kbps). Apply to every restic invocation.
		r.Put("/hosts/{id}/bandwidth", s.handleUpdateHostBandwidth)

		// Per-source-group Run-now (JSON variant). HTMX action is
		// mounted at the equivalent path outside /api below — both
		// resolve to the same handler, which sniffs HX-Request.
		r.Post("/hosts/{id}/source-groups/{gid}/run", s.handleRunSourceGroup)

		// Repo-level run-now: prune (needs admin creds), check, unlock.
		// HTMX forms are also mounted outside /api below.
		r.Post("/hosts/{id}/repo/prune", s.handleRunRepoPrune)
		r.Post("/hosts/{id}/repo/check", s.handleRunRepoCheck)
		r.Post("/hosts/{id}/repo/unlock", s.handleRunRepoUnlock)

		// Cancel a running job. Operator-driven, sends command.cancel
		// to the agent which kills the restic subprocess; the agent's
		// resulting job.finished (cancelled status) is what flips the
		// job row — this endpoint does not change the row's status
		// itself.
		r.Post("/jobs/{id}/cancel", s.handleCancelJob)
	})

	// Per-source-group Run-now (HTMX form action). Available even
	// when the server is started without UI templates so REST callers
	// against the non-/api path also work.
	r.Post("/hosts/{id}/source-groups/{gid}/run", s.handleRunSourceGroup)

	// Repo-level run-now (HTMX form actions). Same handlers as the /api
	// variants — wantsHTML sniff distinguishes JSON vs HTMX response.
	r.Post("/hosts/{id}/repo/prune", s.handleRunRepoPrune)
	r.Post("/hosts/{id}/repo/check", s.handleRunRepoCheck)
	r.Post("/hosts/{id}/repo/unlock", s.handleRunRepoUnlock)

	// Retired routes — see ui_handlers.go for the messages. Mounted
	// outside the UI gate so cached browser tabs get a clear 410
	// even if the server runs without templates.
	r.Post("/hosts/{id}/run-backup", s.handleUIRunBackupGone)
	r.Post("/hosts/{id}/init-repo", s.handleUIInitRepoGone)

	// Pending-host WebSocket (announce-and-approve, P2-18b). Mounted
	// before /ws/agent so the more-specific route matches first.
	// Unlike /ws/agent it is registered even when Hub is nil.
	r.Get("/ws/agent/pending", s.handlePendingWS)

	// Agent ↔ server WebSocket. Bearer-authenticated inside the handler.
	// Skipped entirely when no Hub was supplied.
	if s.deps.Hub != nil {
		r.Mount("/ws/agent", ws.AgentHandler(ws.HandlerDeps{
			Hub:            s.deps.Hub,
			Store:          s.deps.Store,
			JobHub:         s.deps.JobHub,
			OnHello:        s.onAgentHello,
			OnScheduleAck:  s.applyScheduleAck,
			OnScheduleFire: s.dispatchScheduledJob,
		}))
	}

	// Agent binaries + install scripts. Open endpoints — content is
	// unprivileged on its own, gating happens via the enrollment
	// token. See agent_assets.go.
	r.Get("/agent/binary", s.handleAgentBinary)
	r.Get("/install/*", s.handleInstallAsset)

	// Static assets (Tailwind CSS bundle, future favicon).
	r.Mount("/static/", staticHandler())

	// HTML UI. Only mounted when a renderer was supplied: a nil
	// renderer (e.g. pared-down test wiring) is not an error here, it
	// simply yields a server without HTML pages.
	if s.deps.UI != nil {
		r.Get("/", s.handleUIDashboard)
		r.Get("/login", s.handleUILoginGet)
		r.Post("/login", s.handleUILoginPost)
		r.Post("/logout", s.handleUILogoutPost)

		// Per-host Run-now and manual Init-repo are mounted at the
		// outer router (so they reply 410 even without UI). Per-
		// source-group Run-now lives there too — same reason.

		// Add host flow.
		r.Get("/hosts/new", s.handleUIAddHostGet)
		r.Post("/hosts/new", s.handleUIAddHostPost)

		// Durable post-Add-host page (operator can refresh / come
		// back; password decrypted from the token row each render).
		// Polled fragment under /awaiting flips to "connected" once
		// the agent enrolls.
		r.Get("/hosts/pending/{token}", s.handleUIPendingHost)
		r.Get("/hosts/pending/{token}/awaiting", s.handleUIPendingAwaiting)

		// Host detail (Snapshots tab is the default).
		r.Get("/hosts/{id}", s.handleUIHostDetail)

		// Sources tab + source-group CRUD forms. The new and edit
		// forms post to the same save handler.
		r.Get("/hosts/{id}/sources", s.handleUIHostSources)
		r.Get("/hosts/{id}/sources/new", s.handleUISourceGroupNewGet)
		r.Post("/hosts/{id}/sources/new", s.handleUISourceGroupSave)
		r.Get("/hosts/{id}/sources/{gid}/edit", s.handleUISourceGroupEditGet)
		r.Post("/hosts/{id}/sources/{gid}/edit", s.handleUISourceGroupSave)
		r.Post("/hosts/{id}/sources/{gid}/delete", s.handleUISourceGroupDelete)

		// Repo tab — connection / bandwidth / maintenance. Three
		// independent forms so saving one doesn't touch the others.
		r.Get("/hosts/{id}/repo", s.handleUIHostRepo)
		r.Post("/hosts/{id}/repo/credentials", s.handleUIRepoCredentialsSave)
		r.Post("/hosts/{id}/repo/bandwidth", s.handleUIRepoBandwidthSave)
		r.Post("/hosts/{id}/repo/maintenance", s.handleUIRepoMaintenanceSave)
		r.Post("/hosts/{id}/repo/reinit", s.handleUIRepoReinit)
		r.Post("/hosts/{id}/repo/hooks", s.handleUIRepoHooksSave)

		// Admin credentials form (separate slot for prune-capable user).
		r.Post("/hosts/{id}/admin-credentials", s.handleUIAdminCredentialsSave)
		r.Post("/hosts/{id}/admin-credentials/delete", s.handleUIAdminCredentialsDelete)

		// Schedules tab + create/edit/delete forms. The new and edit
		// forms post to the same save handler.
		r.Get("/hosts/{id}/schedules", s.handleUISchedulesList)
		r.Get("/hosts/{id}/schedules/new", s.handleUIScheduleNewGet)
		r.Post("/hosts/{id}/schedules/new", s.handleUIScheduleSave)
		r.Get("/hosts/{id}/schedules/{sid}/edit", s.handleUIScheduleEditGet)
		r.Post("/hosts/{id}/schedules/{sid}/edit", s.handleUIScheduleSave)
		r.Post("/hosts/{id}/schedules/{sid}/delete", s.handleUIScheduleDelete)
		r.Post("/hosts/{id}/schedules/{sid}/run", s.handleUIScheduleRun)

		// Live job log.
		r.Get("/jobs/{id}", s.handleUIJobDetail)
	}

	// Browser job-log stream (separate from /ws/agent so the auth
	// layer is session-cookie not bearer). Mounted regardless of
	// whether the UI is up — JSON callers may also subscribe.
	// Registered on the outer router with its full /api/... path
	// rather than inside the /api block above, because it is
	// conditional on JobHub being supplied.
	if s.deps.JobHub != nil {
		r.Get("/api/jobs/{id}/stream", s.handleJobStream)
	}
}
// Start listens on the configured address and serves until the
// underlying server stops, so it blocks for the server's lifetime. A
// stop caused by Shutdown (http.ErrServerClosed) is reported as nil;
// any other listen/serve failure is returned unchanged. The server is
// HTTP-only by design; production deployments terminate TLS at a
// reverse proxy in front.
func (s *Server) Start() error {
	if err := s.srv.ListenAndServe(); !errors.Is(err, stdhttp.ErrServerClosed) {
		return err
	}
	return nil
}
// Shutdown stops accepting new connections and waits up to ctx.Deadline
// for in-flight handlers to finish. It delegates directly to
// net/http's Server.Shutdown and returns its error (ctx's error if the
// context expires first).
//
// NOTE(review): per the net/http documentation, Shutdown neither closes
// nor waits for hijacked connections. If the WebSocket endpoints hijack
// their connections, they need their own shutdown signalling — confirm.
func (s *Server) Shutdown(ctx context.Context) error {
	return s.srv.Shutdown(ctx)
}
// Addr returns the configured listen address, i.e. the string taken
// from Cfg.Listen in New. It is the configured value, not the bound
// socket address: a caller that configured ":0" gets ":0" back, not
// the port the kernel picked.
func (s *Server) Addr() string { return s.srv.Addr }