Files
restic-manager/internal/server/ws/handler.go
T
steve bd434bd1d0
CI / Test (linux/amd64) (push) Has been cancelled
CI / Lint (push) Has been cancelled
CI / Build (windows/amd64) (push) Has been cancelled
CI / Build (linux/amd64) (push) Has been cancelled
CI / Build (linux/arm64) (push) Has been cancelled
P1-26: live job log viewer + WS browser fan-out hub
Closes the P1-21 remainder.

internal/server/ws/jobhub.go — new JobHub. Per-job_id set of
subscribers; each gets a 64-deep buffered channel with a writer
goroutine. Broadcast is non-blocking: if a subscriber is slow,
its channel fills and messages are dropped for that subscriber
only — the agent's read loop is never blocked by a stuck browser.

The agent dispatchAgentMessage path mirrors job.started /
job.progress / log.stream / job.finished envelopes onto the hub
in addition to its existing persistence work. The wire shape is
the same end-to-end, so client-side JS switches on env.type the
same way Go code does.

GET /api/jobs/{id}/stream is the browser endpoint. Auth via
session cookie (HTTP layer); upgrade; subscribe; pump until
context closes.

GET /jobs/{id} renders the live log page. Three states (queued/
running/succeeded/failed) drive the header pill, the progress
bar block, the failure summary panel, and the action button
(Cancel job while running, Back to host afterwards). Already-
persisted log lines are server-rendered on initial load; new
lines arrive over the WS and append to #log-stream. Auto-scrolls
unless the user scrolls up (a "⇢ Follow" pill re-attaches).
On job.finished the page reloads after 600ms to pick up the
final-state header rendered server-side.

POST /hosts/{id}/run-backup now sets HX-Redirect → /jobs/{job_id}
on success so HTMX lands the operator straight on the live log.
For non-HTMX callers (curl / plain form post) it 303s to the
same target.

store.ListJobLogs returns persisted log lines for initial render
on page load.

Browser-verified end-to-end: enrol → run a real backup against a
sibling restic/rest-server → live progress + 11 log lines stream
in → succeeded pill + final stats land after page reload.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-01 21:45:56 +01:00

261 lines
8.3 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
package ws
import (
"context"
"encoding/json"
"errors"
"fmt"
"log/slog"
stdhttp "net/http"
"strings"
"time"
"github.com/coder/websocket"
"gitea.dcglab.co.uk/steve/restic-manager/internal/api"
"gitea.dcglab.co.uk/steve/restic-manager/internal/auth"
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
)
// HandlerDeps is the set of collaborators the agent WS handler needs.
type HandlerDeps struct {
Hub *Hub
Store *store.Store
JobHub *JobHub
// OnHello is called once per successful hello, after the host row
// has been touched and the conn registered. Used by the HTTP
// layer to push host_credentials down as a config.update before
// the agent starts asking for jobs. Optional; nil = no-op.
OnHello func(ctx context.Context, hostID string, conn *Conn)
}
// AgentHandler is the http.Handler that owns /ws/agent. Agents
// authenticate with `Authorization: Bearer <token>` (issued at
// enrollment) before the WS upgrade.
//
// Lifecycle:
// 1. Bearer token resolves to a Host row.
// 2. Upgrade.
// 3. First message must be `hello`; protocol_version checked here.
// 4. Loop: read messages, dispatch by type. Heartbeats touch the
// host row; job/log/repo messages forward to the relevant
// handlers (TODO: lands with P1-18 onward).
// 5. On Read error or context cancel, mark host offline, unregister
// from the hub.
func AgentHandler(deps HandlerDeps) stdhttp.Handler {
return stdhttp.HandlerFunc(func(w stdhttp.ResponseWriter, r *stdhttp.Request) {
host, ok := authenticateAgent(r, deps.Store)
if !ok {
stdhttp.Error(w, "unauthorized", stdhttp.StatusUnauthorized)
return
}
conn, err := websocket.Accept(w, r, &websocket.AcceptOptions{
InsecureSkipVerify: true, // Origin checks are pointless for an agent CLI.
})
if err != nil {
slog.Warn("ws accept failed", "err", err, "host_id", host.ID)
return
}
c := NewConn(host.ID, conn)
// Keep agents alive across NAT boxes; coder/websocket
// auto-pings under the hood when configured. The default 60s
// works fine for a 30s heartbeat cadence.
runAgentLoop(r.Context(), c, host.ID, deps)
})
}
// authenticateAgent returns the host that owns the bearer token in
// the request, or (nil, false) if anything is amiss. The same
// "false" path is used for missing header, malformed header, unknown
// token — no information leak about why.
func authenticateAgent(r *stdhttp.Request, st *store.Store) (*store.Host, bool) {
hdr := r.Header.Get("Authorization")
const prefix = "Bearer "
if !strings.HasPrefix(hdr, prefix) {
return nil, false
}
token := strings.TrimPrefix(hdr, prefix)
if token == "" {
return nil, false
}
h, err := st.LookupHostByAgentToken(r.Context(), auth.HashToken(token))
if err != nil {
return nil, false
}
return h, true
}
// runAgentLoop is the per-connection driver. Returns when the socket
// is closed for any reason. It owns the hub registration: register on
// hello acceptance, unregister on exit.
func runAgentLoop(ctx context.Context, c *Conn, hostID string, deps HandlerDeps) {
// Stage 1: hello (with a tight deadline).
helloCtx, cancel := context.WithTimeout(ctx, 10*time.Second)
hello, err := c.Read(helloCtx)
cancel()
if err != nil {
slog.Info("ws hello read failed", "host_id", hostID, "err", err)
_ = c.Close()
return
}
if hello.Type != api.MsgHello {
c.SendError(ctx, api.ErrBadRequest, "first message must be hello", "")
return
}
var helloPayload api.HelloPayload
if err := hello.UnmarshalPayload(&helloPayload); err != nil {
c.SendError(ctx, api.ErrBadRequest, "malformed hello payload", "")
return
}
if helloPayload.ProtocolVersion < api.MinAgentProtocolVersion {
c.SendError(ctx, api.ErrProtocolTooOld,
fmt.Sprintf("agent protocol_version %d below minimum %d",
helloPayload.ProtocolVersion, api.MinAgentProtocolVersion),
"https://restic-manager.example/docs/upgrade")
return
}
if helloPayload.ProtocolVersion > api.CurrentProtocolVersion {
// Forward-compat is fine — newer agents talking to older
// servers should accept their lower version. Just log it.
slog.Info("ws agent newer than server",
"host_id", hostID,
"agent_proto", helloPayload.ProtocolVersion,
"server_proto", api.CurrentProtocolVersion)
}
now := time.Now().UTC()
if err := deps.Store.MarkHostHello(ctx, hostID,
helloPayload.AgentVersion, helloPayload.ResticVersion,
helloPayload.ProtocolVersion, now); err != nil {
slog.Error("ws mark host hello failed", "host_id", hostID, "err", err)
}
deps.Hub.Register(hostID, c)
defer deps.Hub.Unregister(hostID, c)
defer func() { _ = c.Close() }()
slog.Info("ws agent connected",
"host_id", hostID,
"agent_version", helloPayload.AgentVersion,
"protocol_version", helloPayload.ProtocolVersion)
if deps.OnHello != nil {
// Run synchronously so the config.update lands before any
// command.run an operator might race in.
deps.OnHello(ctx, hostID, c)
}
// Stage 2: main read loop.
for {
env, err := c.Read(ctx)
if err != nil {
if !errors.Is(err, context.Canceled) {
slog.Info("ws agent read loop ended", "host_id", hostID, "err", err)
}
return
}
dispatchAgentMessage(ctx, c, hostID, env, deps)
}
}
// dispatchAgentMessage routes a single envelope to its handler.
func dispatchAgentMessage(ctx context.Context, c *Conn, hostID string, env api.Envelope, deps HandlerDeps) {
switch env.Type {
case api.MsgHeartbeat:
_ = deps.Store.TouchHost(ctx, hostID, time.Now().UTC())
case api.MsgJobStarted:
var p api.JobStartedPayload
_ = env.UnmarshalPayload(&p)
if err := deps.Store.MarkJobStarted(ctx, p.JobID, p.StartedAt); err != nil {
slog.Warn("ws: mark job started", "job_id", p.JobID, "err", err)
}
if deps.JobHub != nil {
deps.JobHub.Broadcast(p.JobID, env)
}
case api.MsgJobProgress:
// Progress ticks aren't persisted (1Hz × every job × every
// path-walk would dwarf the rest of the DB). The live UI
// subscribes to JobHub and gets them in real time; once a
// job finishes the final summary lands via job.finished.
var p api.JobProgressPayload
_ = env.UnmarshalPayload(&p)
if deps.JobHub != nil {
deps.JobHub.Broadcast(p.JobID, env)
}
case api.MsgJobFinished:
var p api.JobFinishedPayload
_ = env.UnmarshalPayload(&p)
errMsg := p.Error
if err := deps.Store.MarkJobFinished(ctx, p.JobID,
string(p.Status), p.ExitCode, p.Stats, errMsg, p.FinishedAt); err != nil {
slog.Warn("ws: mark job finished", "job_id", p.JobID, "err", err)
}
if deps.JobHub != nil {
deps.JobHub.Broadcast(p.JobID, env)
}
case api.MsgLogStream:
var p api.LogStreamLine
_ = env.UnmarshalPayload(&p)
if err := deps.Store.AppendJobLog(ctx, p.JobID, p.Seq, p.TS,
string(p.Stream), p.Payload); err != nil {
slog.Warn("ws: append job log", "job_id", p.JobID, "err", err)
}
if deps.JobHub != nil {
deps.JobHub.Broadcast(p.JobID, env)
}
case api.MsgSnapshotsRpt:
var p api.SnapshotsReportPayload
if err := env.UnmarshalPayload(&p); err != nil {
slog.Warn("ws: bad snapshots.report payload", "host_id", hostID, "err", err)
break
}
snaps := make([]store.Snapshot, len(p.Snapshots))
for i, s := range p.Snapshots {
snaps[i] = store.Snapshot{
ID: s.ID,
ShortID: s.ShortID,
Time: s.Time,
Hostname: s.Hostname,
Paths: s.Paths,
Tags: s.Tags,
SizeBytes: s.SizeBytes,
FileCount: s.FileCount,
}
}
if err := deps.Store.ReplaceHostSnapshots(ctx, hostID, snaps, time.Now().UTC()); err != nil {
slog.Warn("ws: replace snapshots", "host_id", hostID, "err", err)
} else {
slog.Info("ws: snapshots refreshed", "host_id", hostID, "count", len(snaps))
}
case api.MsgRepoStats, api.MsgScheduleAck, api.MsgCommandResult:
// TODO(P2): persist these projections.
slog.Debug("ws msg not yet handled", "type", env.Type, "host_id", hostID)
case api.MsgError:
var ep api.ErrorPayload
_ = env.UnmarshalPayload(&ep)
slog.Warn("ws agent reported error", "host_id", hostID,
"code", string(ep.Code), "message", ep.Message)
default:
slog.Warn("ws unknown message type from agent",
"type", env.Type, "host_id", hostID)
}
}
// MinHeartbeatInterval is a sanity floor — any agent reporting
// heartbeats more often than this is misbehaving. (Spec says 30s.)
const MinHeartbeatInterval = 5 * time.Second
// suppress unused-import false-positives if json drops out later
var _ = json.Marshal