phase 1: run-now backup — restic wrapper, job lifecycle, end-to-end
Lands the operator → server → agent → restic → server roundtrip for
on-demand backups. The flow:
POST /api/hosts/{id}/jobs {kind:"backup",args:["/path"]}
→ server creates a queued Job row
→ server emits command.run over WS to the host's agent
→ agent dispatcher spawns runner.RunBackup in a goroutine
→ runner spawns `restic backup --json`, parses each line
→ forwards: job.started, log.stream (every line), job.progress
(throttled to 1/sec), job.finished (with summary stats blob)
→ server WS handler persists those into jobs / job_logs
P1-16 internal/restic: thin Locate + Env wrapper that runs `restic
backup --json`, scans stdout/stderr, parses BackupStatus +
BackupSummary, calls back into a LineHandler so the agent can fan
out to log.stream + job.progress. Treats exit code 3 as
"succeeded with issues" (matches restic's contract).
P1-18 store: jobs accessors (CreateJob, MarkJobStarted,
MarkJobFinished, AppendJobLog, GetJob).
P1-19 server: POST /api/hosts/{id}/jobs creates the Job row,
validates kind, dispatches via Hub.Send, audit-logs the action.
P1-20 agent runner: wraps restic.RunBackup with throttled progress
emission. Sender abstraction was added to wsclient.Handler so
background goroutines can keep replying after dispatch returns.
P1-21 server WS: dispatchAgentMessage now persists job.started,
job.finished, log.stream into the database. Browser fan-out for
live tailing lands with the UI work.
Agent gets repo_url + repo_password from agent.yaml in plaintext
for now (mode 0600, owned by the service user); they move into the
keyring storage described in spec.md §7.3 in P2. A config.update
received over WS overrides the in-memory copy only (it is not persisted).
Build clean; all tests pass. End-to-end with a real restic still
needs a host that has restic installed — wire shape verified by
the existing hello/heartbeat round-trip test.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -33,6 +33,15 @@ type Config struct {
|
||||
// ResticPath overrides the auto-detected restic binary path.
|
||||
ResticPath string `yaml:"restic_path,omitempty"`
|
||||
|
||||
// RepoURL + RepoPassword are the credentials this host uses to
|
||||
// reach its restic repository. Phase 1 keeps these in plaintext
|
||||
// in agent.yaml (mode 0600 owned by the agent service user); the
|
||||
// server-pushed config.update message can override them in
|
||||
// memory. Phase 2 moves them into the OS keyring (DPAPI on
|
||||
// Windows, Secret Service on Linux).
|
||||
RepoURL string `yaml:"repo_url,omitempty"`
|
||||
RepoPassword string `yaml:"repo_password,omitempty"`
|
||||
|
||||
// path is the file we loaded from. Used by Save.
|
||||
path string `yaml:"-"`
|
||||
}
|
||||
|
||||
@@ -0,0 +1,141 @@
|
||||
// Package runner spawns restic processes for the agent. It owns one
|
||||
// Run() invocation per command.run; concurrency limits live a layer
|
||||
// up (the WS handler).
|
||||
package runner
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/api"
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/restic"
|
||||
)
|
||||
|
||||
// Sender is the agent's outbound message channel. Provided by
|
||||
// wsclient so the runner can push job.started / job.progress /
|
||||
// job.finished / log.stream back to the server.
|
||||
type Sender interface {
|
||||
Send(env api.Envelope) error
|
||||
}
|
||||
|
||||
// Config groups the long-lived settings a Runner needs. The values
// originate in the agent's config file; a server-pushed config.update
// payload may override them in memory.
type Config struct {
	// ResticBin is the restic binary handed to restic.Env as Bin.
	ResticBin string
	// RepoURL is the restic repository location.
	RepoURL string
	// RepoPassword is the restic repository password.
	RepoPassword string
}
|
||||
|
||||
// Runner owns the restic invocations.
|
||||
type Runner struct {
|
||||
cfg Config
|
||||
tx Sender
|
||||
|
||||
// progress throttling: we receive a status event from restic
|
||||
// every ~100ms; the UI doesn't need anywhere near that rate.
|
||||
// Cap WS sends to one per N (configurable; default 1s).
|
||||
progressMinPeriod time.Duration
|
||||
}
|
||||
|
||||
// New builds a Runner. progressMinPeriod = 0 uses the default 1s.
|
||||
func New(cfg Config, tx Sender, progressMinPeriod time.Duration) *Runner {
|
||||
if progressMinPeriod <= 0 {
|
||||
progressMinPeriod = time.Second
|
||||
}
|
||||
return &Runner{cfg: cfg, tx: tx, progressMinPeriod: progressMinPeriod}
|
||||
}
|
||||
|
||||
// RunBackup executes a backup job and reports back via the sender.
|
||||
// Returns nil on a clean (or "incomplete-but-snapshot-created") finish.
|
||||
func (r *Runner) RunBackup(ctx context.Context, jobID string, paths, excludes, tags []string) error {
|
||||
startedAt := time.Now().UTC()
|
||||
|
||||
startEnv, _ := api.Marshal(api.MsgJobStarted, jobID, api.JobStartedPayload{
|
||||
JobID: jobID, Kind: api.JobBackup, StartedAt: startedAt,
|
||||
})
|
||||
if err := r.tx.Send(startEnv); err != nil {
|
||||
slog.Warn("runner: send job.started", "err", err)
|
||||
}
|
||||
|
||||
env := restic.Env{
|
||||
Bin: r.cfg.ResticBin,
|
||||
RepoURL: r.cfg.RepoURL,
|
||||
RepoPassword: r.cfg.RepoPassword,
|
||||
}
|
||||
|
||||
var seq atomic.Int64
|
||||
lastProgress := time.Now()
|
||||
|
||||
handle := func(stream string, line string, ev any) {
|
||||
// Forward every line to the server as log.stream.
|
||||
now := time.Now().UTC()
|
||||
logEnv, _ := api.Marshal(api.MsgLogStream, "", api.LogStreamLine{
|
||||
JobID: jobID,
|
||||
Seq: seq.Add(1),
|
||||
TS: now,
|
||||
Stream: api.LogStream(stream),
|
||||
Payload: line,
|
||||
})
|
||||
_ = r.tx.Send(logEnv)
|
||||
|
||||
// Throttled progress events.
|
||||
if status, ok := ev.(restic.BackupStatus); ok {
|
||||
if time.Since(lastProgress) < r.progressMinPeriod {
|
||||
return
|
||||
}
|
||||
lastProgress = time.Now()
|
||||
progEnv, _ := api.Marshal(api.MsgJobProgress, jobID, api.JobProgressPayload{
|
||||
JobID: jobID,
|
||||
PercentDone: status.PercentDone,
|
||||
FilesDone: status.FilesDone,
|
||||
TotalFiles: status.TotalFiles,
|
||||
BytesDone: status.BytesDone,
|
||||
TotalBytes: status.TotalBytes,
|
||||
ETASeconds: status.SecondsRem,
|
||||
ThroughputBps: throughput(status.BytesDone, status.SecondsElapsed),
|
||||
})
|
||||
_ = r.tx.Send(progEnv)
|
||||
}
|
||||
}
|
||||
|
||||
summary, err := env.RunBackup(ctx, paths, excludes, tags, handle)
|
||||
finishedAt := time.Now().UTC()
|
||||
|
||||
status := api.JobSucceeded
|
||||
exit := 0
|
||||
errMsg := ""
|
||||
if err != nil {
|
||||
status = api.JobFailed
|
||||
exit = -1
|
||||
errMsg = err.Error()
|
||||
}
|
||||
var statsBlob json.RawMessage
|
||||
if summary != nil {
|
||||
statsBlob, _ = json.Marshal(summary)
|
||||
}
|
||||
finEnv, _ := api.Marshal(api.MsgJobFinished, jobID, api.JobFinishedPayload{
|
||||
JobID: jobID,
|
||||
Status: status,
|
||||
ExitCode: exit,
|
||||
FinishedAt: finishedAt,
|
||||
Stats: statsBlob,
|
||||
Error: errMsg,
|
||||
})
|
||||
_ = r.tx.Send(finEnv)
|
||||
if err != nil {
|
||||
return fmt.Errorf("runner backup: %w", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// throughput reports the mean transfer rate in bytes per second as the
// integer quotient bytesDone / secondsElapsed. It yields 0 when
// secondsElapsed is zero or negative, which also avoids a division by
// zero.
func throughput(bytesDone, secondsElapsed int64) int64 {
	if secondsElapsed > 0 {
		return bytesDone / secondsElapsed
	}
	return 0
}
|
||||
@@ -19,6 +19,7 @@ import (
|
||||
stdhttp "net/http"
|
||||
"net/url"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/coder/websocket"
|
||||
@@ -36,10 +37,19 @@ type Config struct {
|
||||
HelloPayload api.HelloPayload
|
||||
}
|
||||
|
||||
// Handler is invoked for every server-sent message. The agent's main
|
||||
// program supplies one that knows how to dispatch command.run etc.
|
||||
// to the runner package.
|
||||
type Handler func(ctx context.Context, env api.Envelope) error
|
||||
// Sender is what handlers use to push agent → server messages
|
||||
// (job.progress, job.finished, log.stream, command.result, …).
|
||||
// Returned by the WS client to the dispatch handler. Write operations
|
||||
// serialise behind a single mutex on the conn; concurrent calls are
|
||||
// safe.
|
||||
type Sender interface {
|
||||
Send(env api.Envelope) error
|
||||
}
|
||||
|
||||
// Handler is invoked for every server-sent message. tx lets the
|
||||
// handler push replies back; it is valid only for the lifetime of
|
||||
// the connection (calls fail if the agent has reconnected since).
|
||||
type Handler func(ctx context.Context, env api.Envelope, tx Sender) error
|
||||
|
||||
// Run keeps the agent connected indefinitely. Returns when ctx is
|
||||
// cancelled. Errors during a single connection attempt are logged and
|
||||
@@ -107,6 +117,8 @@ func connectOnce(ctx context.Context, cfg Config, handle Handler) error {
|
||||
}
|
||||
slog.Info("ws agent connected", "server", wsURL)
|
||||
|
||||
tx := &connSender{conn: conn, ctx: ctx}
|
||||
|
||||
// Heartbeat goroutine.
|
||||
heartbeatCtx, cancelHeartbeat := context.WithCancel(ctx)
|
||||
defer cancelHeartbeat()
|
||||
@@ -138,13 +150,34 @@ func connectOnce(ctx context.Context, cfg Config, handle Handler) error {
|
||||
continue
|
||||
}
|
||||
if handle != nil {
|
||||
if err := handle(ctx, env); err != nil {
|
||||
if err := handle(ctx, env, tx); err != nil {
|
||||
slog.Warn("ws agent: handler returned error", "type", env.Type, "err", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// connSender is the per-connection Sender. Goroutines beyond the
|
||||
// read loop (e.g. a backup running in its own goroutine) keep a
|
||||
// reference to one of these for the duration of their work.
|
||||
type connSender struct {
|
||||
conn *websocket.Conn
|
||||
ctx context.Context
|
||||
mu sync.Mutex
|
||||
}
|
||||
|
||||
func (s *connSender) Send(env api.Envelope) error {
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
raw, err := json.Marshal(env)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
writeCtx, cancel := context.WithTimeout(s.ctx, 30*time.Second)
|
||||
defer cancel()
|
||||
return s.conn.Write(writeCtx, websocket.MessageText, raw)
|
||||
}
|
||||
|
||||
func heartbeatLoop(ctx context.Context, conn *websocket.Conn, period time.Duration) {
|
||||
t := time.NewTicker(period)
|
||||
defer t.Stop()
|
||||
|
||||
Reference in New Issue
Block a user