7cc17813a9
Single public deliverable per tag: a multi-arch server image, with cross-compiled agent binaries + install scripts + the systemd unit baked under /opt/restic-manager/dist/. The /agent/binary and /install/* handlers fall back from <DataDir>/... to that read-only path so a fresh container Just Works without first-run staging; operators can still drop a custom build into <DataDir>/ to override per-host. Architecture rationale: agent distribution already routes through the running server, so the release surface mirrors that — there's no second source of truth to keep in sync. Workflow .gitea/workflows/release.yml triggers on v*.*.* tag-push (fan-out :vX.Y.Z / :X.Y / :X, plus :latest once MAJOR>=1) and workflow_dispatch (snapshot tag only). Pushes to the Gitea container registry on this instance. Both binaries grow main.commit + main.date ldflag targets. Makefile and Dockerfile fill them; release workflow forwards from gitea.sha plus a UTC timestamp. Spec : docs/superpowers/specs/2026-05-05-p5-03-docker-only-release.md Plan : docs/superpowers/plans/2026-05-05-p5-03-docker-only-release.md
246 lines
7.7 KiB
Go
246 lines
7.7 KiB
Go
package main
|
|
|
|
import (
|
|
"context"
|
|
"errors"
|
|
"flag"
|
|
"fmt"
|
|
"log/slog"
|
|
"os"
|
|
"os/signal"
|
|
"path/filepath"
|
|
"syscall"
|
|
"time"
|
|
|
|
"gitea.dcglab.co.uk/steve/restic-manager/internal/alert"
|
|
"gitea.dcglab.co.uk/steve/restic-manager/internal/auth"
|
|
"gitea.dcglab.co.uk/steve/restic-manager/internal/crypto"
|
|
"gitea.dcglab.co.uk/steve/restic-manager/internal/notification"
|
|
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/config"
|
|
rmhttp "gitea.dcglab.co.uk/steve/restic-manager/internal/server/http"
|
|
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/maintenance"
|
|
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/oidc"
|
|
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/ui"
|
|
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/ws"
|
|
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
|
|
)
|
|
|
|
var (
|
|
version = "dev"
|
|
commit = "none"
|
|
date = "unknown"
|
|
)
|
|
|
|
func main() {
|
|
if err := run(); err != nil {
|
|
slog.Error("server fatal", "err", err)
|
|
os.Exit(1)
|
|
}
|
|
}
|
|
|
|
func run() error {
|
|
configPath := flag.String("config", "", "path to YAML config (optional; env vars win regardless)")
|
|
showVersion := flag.Bool("version", false, "print version and exit")
|
|
flag.Parse()
|
|
|
|
if *showVersion {
|
|
fmt.Printf("restic-manager-server %s (commit %s, built %s)\n", version, commit, date)
|
|
return nil
|
|
}
|
|
|
|
logger := slog.New(slog.NewJSONHandler(os.Stdout, &slog.HandlerOptions{Level: slog.LevelInfo}))
|
|
slog.SetDefault(logger)
|
|
|
|
cfg, err := config.Load(*configPath)
|
|
if err != nil {
|
|
return fmt.Errorf("config: %w", err)
|
|
}
|
|
slog.Info("config resolved", "listen", cfg.Listen, "data_dir", cfg.DataDir,
|
|
"cookie_secure", cfg.CookieSecure, "trusted_proxies", cfg.TrustedProxies)
|
|
|
|
if err := os.MkdirAll(cfg.DataDir, 0o700); err != nil {
|
|
return fmt.Errorf("ensure data dir: %w", err)
|
|
}
|
|
|
|
// Mint or load the encryption key.
|
|
if _, err := os.Stat(cfg.SecretKeyFile); errors.Is(err, os.ErrNotExist) {
|
|
slog.Warn("no secret key found; generating a new one — back this up before continuing",
|
|
"path", cfg.SecretKeyFile)
|
|
if err := crypto.GenerateKeyFile(cfg.SecretKeyFile); err != nil {
|
|
return fmt.Errorf("generate secret key: %w", err)
|
|
}
|
|
}
|
|
keyBytes, err := crypto.LoadKeyFromFile(cfg.SecretKeyFile)
|
|
if err != nil {
|
|
return fmt.Errorf("load secret key: %w", err)
|
|
}
|
|
aead, err := crypto.NewAEAD(keyBytes)
|
|
if err != nil {
|
|
return fmt.Errorf("init AEAD: %w", err)
|
|
}
|
|
|
|
dbPath := filepath.Join(cfg.DataDir, "restic-manager.db")
|
|
st, err := store.Open(context.Background(), dbPath)
|
|
if err != nil {
|
|
return fmt.Errorf("open store: %w", err)
|
|
}
|
|
defer func() { _ = st.Close() }()
|
|
|
|
hub := ws.NewHub()
|
|
jobHub := ws.NewJobHub()
|
|
|
|
notifHub := notification.NewHub(st, aead, cfg.BaseURL)
|
|
alertEngine := alert.NewEngine(st, notifHub)
|
|
|
|
renderer, err := ui.New()
|
|
if err != nil {
|
|
return fmt.Errorf("ui: %w", err)
|
|
}
|
|
|
|
var oidcClient *oidc.Client
|
|
if cfg.OIDC != nil {
|
|
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
|
defer cancel()
|
|
oidcClient, err = oidc.New(ctx, cfg.OIDC, cfg.BaseURL)
|
|
if err != nil {
|
|
return fmt.Errorf("oidc: %w", err)
|
|
}
|
|
slog.Info("oidc enabled", "issuer", cfg.OIDC.Issuer, "display", cfg.OIDC.DisplayName)
|
|
}
|
|
|
|
deps := rmhttp.Deps{
|
|
Cfg: cfg,
|
|
Store: st,
|
|
AEAD: aead,
|
|
Hub: hub,
|
|
JobHub: jobHub,
|
|
AlertEngine: alertEngine,
|
|
NotificationHub: notifHub,
|
|
UI: renderer,
|
|
Version: version,
|
|
OIDC: oidcClient,
|
|
}
|
|
|
|
// First-run bootstrap: if the users table is empty, mint a one-time
|
|
// token and print it. /api/bootstrap accepts it to create the first
|
|
// admin user, then becomes a no-op.
|
|
n, err := st.CountUsers(context.Background())
|
|
if err != nil {
|
|
return fmt.Errorf("count users: %w", err)
|
|
}
|
|
if n == 0 {
|
|
token, err := auth.NewToken()
|
|
if err != nil {
|
|
return fmt.Errorf("mint bootstrap token: %w", err)
|
|
}
|
|
deps.BootstrapToken = token
|
|
// Stable, easy-to-grep marker so an operator finds this in
|
|
// scrolling logs without spelunking. Token is shown in plain
|
|
// text exactly once; we hash it into BootstrapToken on the
|
|
// server-side handler.
|
|
fmt.Fprintln(os.Stderr, "================================================================")
|
|
fmt.Fprintln(os.Stderr, " FIRST RUN — bootstrap token (use within 1 hour, then it's gone):")
|
|
fmt.Fprintln(os.Stderr, " "+token)
|
|
fmt.Fprintln(os.Stderr, " POST it to /api/bootstrap with {token, username, password}.")
|
|
fmt.Fprintln(os.Stderr, "================================================================")
|
|
}
|
|
|
|
srv := rmhttp.New(deps)
|
|
|
|
ctx, stop := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
|
|
defer stop()
|
|
|
|
go alertEngine.Run(ctx)
|
|
|
|
errCh := make(chan error, 1)
|
|
go func() {
|
|
slog.Info("server listening", "addr", cfg.Listen, "version", version)
|
|
errCh <- srv.Start()
|
|
}()
|
|
|
|
// Background sweepers:
|
|
// - sessions/tokens purge: 15 min
|
|
// - host offline-after-90s mark: every 30s (matches heartbeat
|
|
// cadence — agent sends every 30s, P1-12)
|
|
purgeTick := time.NewTicker(15 * time.Minute)
|
|
defer purgeTick.Stop()
|
|
offlineTick := time.NewTicker(30 * time.Second)
|
|
defer offlineTick.Stop()
|
|
// Maintenance ticker: drives forget/prune/check on the cadences
|
|
// operators set per-host. Independent of the agent's local cron
|
|
// (which only handles backup schedules). 60s cadence — the cron
|
|
// expressions are minute-grained, so anything finer is wasted
|
|
// work.
|
|
maintenanceTick := time.NewTicker(60 * time.Second)
|
|
defer maintenanceTick.Stop()
|
|
// Pending-runs drain ticker: 30s cadence sweeps every host with
|
|
// pending_runs rows whose next_attempt_at <= now (rows accumulate
|
|
// when a schedule.fire's command.run send fails because the agent
|
|
// dropped offline mid-flight). The on-reconnect path in
|
|
// onAgentHello handles the common case; this ticker is the
|
|
// safety-net for hosts that come back without a fresh hello (they
|
|
// shouldn't, but the queue exists either way).
|
|
pendingDrainTick := time.NewTicker(30 * time.Second)
|
|
defer pendingDrainTick.Stop()
|
|
// Pending-hosts expiry sweeper: drops announce rows past their 1h
|
|
// ceiling so the dashboard panel doesn't accumulate stale entries.
|
|
pendingExpiryTick := time.NewTicker(60 * time.Second)
|
|
defer pendingExpiryTick.Stop()
|
|
mt := maintenance.New(st)
|
|
go func() {
|
|
for {
|
|
select {
|
|
case <-ctx.Done():
|
|
return
|
|
case <-purgeTick.C:
|
|
if n, err := st.PurgeExpiredSessions(ctx); err == nil && n > 0 {
|
|
slog.Info("purged expired sessions", "n", n)
|
|
}
|
|
if n, err := st.PurgeExpiredEnrollmentTokens(ctx); err == nil && n > 0 {
|
|
slog.Info("purged expired enrollment tokens", "n", n)
|
|
}
|
|
case <-offlineTick.C:
|
|
cutoff := time.Now().Add(-90 * time.Second)
|
|
if ids, err := st.MarkHostsOfflineStaleReturnIDs(ctx, cutoff); err == nil && len(ids) > 0 {
|
|
slog.Info("marked hosts offline (stale heartbeat)", "n", len(ids))
|
|
for _, id := range ids {
|
|
alertEngine.NotifyHostOffline(id)
|
|
}
|
|
}
|
|
case <-pendingDrainTick.C:
|
|
srv.DrainAllDue(ctx)
|
|
case <-pendingExpiryTick.C:
|
|
if n, err := st.DeleteExpiredPendingHosts(ctx, time.Now().UTC()); err == nil && n > 0 {
|
|
slog.Info("expired pending hosts swept", "n", n)
|
|
}
|
|
case <-maintenanceTick.C:
|
|
decisions, err := mt.Decide(ctx, time.Now().UTC())
|
|
if err != nil {
|
|
slog.Warn("maintenance ticker: decide", "err", err)
|
|
continue
|
|
}
|
|
if len(decisions) > 0 {
|
|
slog.Info("maintenance ticker: dispatching", "n", len(decisions))
|
|
srv.DispatchMaintenance(ctx, decisions)
|
|
}
|
|
}
|
|
}
|
|
}()
|
|
|
|
select {
|
|
case err := <-errCh:
|
|
if err != nil {
|
|
return fmt.Errorf("listen: %w", err)
|
|
}
|
|
case <-ctx.Done():
|
|
slog.Info("shutting down")
|
|
}
|
|
|
|
shutCtx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
|
|
defer cancel()
|
|
if err := srv.Shutdown(shutCtx); err != nil {
|
|
return fmt.Errorf("shutdown: %w", err)
|
|
}
|
|
return nil
|
|
}
|