Files
restic-manager/cmd/server/main.go
T
steve 9cc0caff1e phase 1: WS transport, enrollment, agent that hellos and heartbeats
Lands the protocol layer end-to-end: once enrolled through the
operator UI, an agent can store its credentials, dial back to the
server over WS, complete the protocol_version handshake, and stay
connected with periodic heartbeats.

Server side:
- P1-09 ws.Hub: one Conn per host_id, last-write-wins eviction,
  json envelope writer with a write mutex, reader, error envelopes.
- P1-09 ws.AgentHandler: bearer-auth, accept upgrade, hello-stage
  (10s deadline, protocol_version checked against
  api.MinAgentProtocolVersion → ErrProtocolTooOld with help URL on
  reject), main read loop, defer hub register/unregister.
- P1-10 POST /api/agents/enroll consumes a one-time token, mints a
  persistent agent bearer (sha-256 stored), creates a host row.
- P1-10 POST /api/enrollment-tokens (operator, session-auth)
  issues a 1h one-time token.
- P1-11 hello upserts agent_version + restic_version +
  protocol_version on the host row, flips status to online.
- P1-12 heartbeat touches last_seen_at; background sweeper marks
  hosts offline after 90s without one.
- store: hosts table accessors, host_schedule_version,
  enrollment_tokens FK on consumed_host dropped (audit-only field;
  the token gets burned before the host row exists).

Agent side:
- P1-13 internal/agent/config: yaml at /etc/restic-manager/agent.yaml,
  atomic Save (tmp+fsync+rename), Enrolled() helper.
- P1-15 internal/agent/wsclient: dial with bearer + optional
  TLS cert pinning (sha-256 of leaf), exponential backoff with
  jitter (1s → 60s cap), heartbeat goroutine, fatal handling for
  ErrProtocolTooOld.
- P1-15 wsclient.Enroll: HTTP POST /api/agents/enroll with sysinfo.
- P1-17 internal/agent/sysinfo: hostname/OS/arch/restic-version
  collection. restic detected by `restic version` parse; absent
  restic doesn't block startup.
- cmd/agent: -enroll-server / -enroll-token flags drive first-run
  enrollment then exit (so the install script can hand off to
  systemd to run the persistent service).

End-to-end smoke verified: bootstrap → login → issue token →
enroll → run agent → server logs `ws agent connected` with the
right host_id and protocol_version 1.

All tests still pass.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-01 00:39:00 +01:00

169 lines
4.8 KiB
Go

package main
import (
"context"
"errors"
"flag"
"fmt"
"log/slog"
"os"
"os/signal"
"path/filepath"
"syscall"
"time"
"gitea.dcglab.co.uk/steve/restic-manager/internal/auth"
"gitea.dcglab.co.uk/steve/restic-manager/internal/crypto"
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/config"
rmhttp "gitea.dcglab.co.uk/steve/restic-manager/internal/server/http"
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/ws"
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
)
// version is the server version string printed by -version and attached to
// the "server listening" log line; it defaults to "dev".
// NOTE(review): presumably overridden at build time (e.g. -ldflags -X) — confirm.
var version = "dev"
// main delegates all work to run. A non-nil result is logged as
// "server fatal" and the process exits with status 1; otherwise main simply
// returns.
func main() {
	err := run()
	if err == nil {
		return
	}
	slog.Error("server fatal", "err", err)
	os.Exit(1)
}
// run wires up and runs the server. In order it: parses flags, loads config,
// ensures the data directory exists, generates (if missing) and loads the
// secret key, opens the store, mints a first-run bootstrap token when no
// users exist, starts the HTTP server and the background sweepers, then
// blocks until the server stops or SIGINT/SIGTERM arrives and shuts the
// server down with a 15s deadline.
//
// It returns nil on a clean exit (including -version) and otherwise an error
// wrapped with the name of the step that failed.
func run() error {
	configPath := flag.String("config", "", "path to YAML config (optional; env vars win regardless)")
	showVersion := flag.Bool("version", false, "print version and exit")
	flag.Parse()

	if *showVersion {
		fmt.Println("restic-manager-server", version)
		return nil
	}

	logger := slog.New(slog.NewJSONHandler(os.Stdout, &slog.HandlerOptions{Level: slog.LevelInfo}))
	slog.SetDefault(logger)

	cfg, err := config.Load(*configPath)
	if err != nil {
		return fmt.Errorf("config: %w", err)
	}
	slog.Info("config resolved", "listen", cfg.Listen, "data_dir", cfg.DataDir,
		"tls", cfg.TLSEnabled(), "trusted_proxies", cfg.TrustedProxies)

	// 0o700: the data directory is created accessible to the owner only.
	if err := os.MkdirAll(cfg.DataDir, 0o700); err != nil {
		return fmt.Errorf("ensure data dir: %w", err)
	}

	// Mint or load the encryption key. Only a definite "does not exist"
	// triggers generation; any other Stat failure falls through and is
	// surfaced by LoadKeyFromFile below.
	if _, err := os.Stat(cfg.SecretKeyFile); errors.Is(err, os.ErrNotExist) {
		slog.Warn("no secret key found; generating a new one — back this up before continuing",
			"path", cfg.SecretKeyFile)
		if err := crypto.GenerateKeyFile(cfg.SecretKeyFile); err != nil {
			return fmt.Errorf("generate secret key: %w", err)
		}
	}
	keyBytes, err := crypto.LoadKeyFromFile(cfg.SecretKeyFile)
	if err != nil {
		return fmt.Errorf("load secret key: %w", err)
	}
	aead, err := crypto.NewAEAD(keyBytes)
	if err != nil {
		return fmt.Errorf("init AEAD: %w", err)
	}

	dbPath := filepath.Join(cfg.DataDir, "restic-manager.db")
	st, err := store.Open(context.Background(), dbPath)
	if err != nil {
		return fmt.Errorf("open store: %w", err)
	}
	// Close error is deliberately ignored: we are exiting and have nothing
	// useful to do with it.
	defer func() { _ = st.Close() }()

	hub := ws.NewHub()
	deps := rmhttp.Deps{
		Cfg:   cfg,
		Store: st,
		AEAD:  aead,
		Hub:   hub,
	}

	// First-run bootstrap: if the users table is empty, mint a one-time
	// token and print it. /api/bootstrap accepts it to create the first
	// admin user, then becomes a no-op.
	n, err := st.CountUsers(context.Background())
	if err != nil {
		return fmt.Errorf("count users: %w", err)
	}
	if n == 0 {
		token, err := auth.NewToken()
		if err != nil {
			return fmt.Errorf("mint bootstrap token: %w", err)
		}
		// The plaintext token is handed to the HTTP layer as-is.
		// NOTE(review): an earlier comment said the handler hashes it —
		// confirm in rmhttp; nothing here hashes the value.
		deps.BootstrapToken = token

		// Stable, easy-to-grep marker so an operator finds this in
		// scrolling logs without spelunking. Written to stderr, separate
		// from the JSON log stream on stdout.
		fmt.Fprintln(os.Stderr, "================================================================")
		fmt.Fprintln(os.Stderr, " FIRST RUN — bootstrap token (use within 1 hour, then it's gone):")
		fmt.Fprintln(os.Stderr, " "+token)
		fmt.Fprintln(os.Stderr, " POST it to /api/bootstrap with {token, username, password}.")
		fmt.Fprintln(os.Stderr, "================================================================")
	}

	srv := rmhttp.New(deps)

	// ctx is cancelled on SIGINT/SIGTERM; it bounds the sweeper goroutine
	// and triggers graceful shutdown below.
	ctx, stop := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
	defer stop()

	// Buffered so the serving goroutine can always deliver its result and
	// exit, even if run has already moved on to shutdown.
	errCh := make(chan error, 1)
	go func() {
		slog.Info("server listening", "addr", cfg.Listen, "version", version)
		errCh <- srv.Start()
	}()

	// Background sweepers:
	//   - sessions/tokens purge: every 15 min
	//   - host offline-after-90s mark: every 30s (matches heartbeat
	//     cadence — agent sends every 30s, P1-12)
	purgeTick := time.NewTicker(15 * time.Minute)
	defer purgeTick.Stop()
	offlineTick := time.NewTicker(30 * time.Second)
	defer offlineTick.Stop()

	// sweepFailed reports a failed sweep instead of silently dropping it.
	// Failures observed once ctx is done are suppressed: at that point the
	// process is shutting down and the error is most likely the cancellation
	// itself.
	sweepFailed := func(op string, err error) {
		if ctx.Err() != nil {
			return
		}
		slog.Warn("background sweep failed", "op", op, "err", err)
	}

	go func() {
		for {
			select {
			case <-ctx.Done():
				return

			case <-purgeTick.C:
				sessions, err := st.PurgeExpiredSessions(ctx)
				switch {
				case err != nil:
					sweepFailed("purge expired sessions", err)
				case sessions > 0:
					slog.Info("purged expired sessions", "n", sessions)
				}

				tokens, err := st.PurgeExpiredEnrollmentTokens(ctx)
				switch {
				case err != nil:
					sweepFailed("purge expired enrollment tokens", err)
				case tokens > 0:
					slog.Info("purged expired enrollment tokens", "n", tokens)
				}

			case <-offlineTick.C:
				// Hosts with no heartbeat in the last 90s are considered stale.
				cutoff := time.Now().Add(-90 * time.Second)
				stale, err := st.MarkHostsOfflineStale(ctx, cutoff)
				switch {
				case err != nil:
					sweepFailed("mark hosts offline", err)
				case stale > 0:
					slog.Info("marked hosts offline (stale heartbeat)", "n", stale)
				}
			}
		}
	}()

	// Block until the server stops by itself or a termination signal arrives.
	select {
	case err := <-errCh:
		if err != nil {
			return fmt.Errorf("listen: %w", err)
		}
	case <-ctx.Done():
		slog.Info("shutting down")
	}

	// A fresh context is used because ctx may already be cancelled; shutdown
	// gets its own 15s budget.
	shutCtx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
	defer cancel()
	if err := srv.Shutdown(shutCtx); err != nil {
		return fmt.Errorf("shutdown: %w", err)
	}
	return nil
}