Files
restic-manager/internal/store/hosts.go
T
steve c8ead66f08 P1 polish: agent-as-root, init-repo flow, rest creds passthrough, UX fixes
Cohesive batch from a smoke-test session against a real rest-server.
Themed bullets:

* Agent runs as root, sandboxed via systemd. CapabilityBoundingSet
  drops to CAP_DAC_READ_SEARCH + restore caps; ProtectSystem=strict
  with ReadWritePaths confined to /etc + /var/lib/restic-manager;
  NoNewPrivileges blocks escalation. Install script no longer
  creates a service user. spec.md §4.2 / §14.1 / §14.3 explain the
  rationale (matches UrBackup / Veeam / Bareos defaults; trying to
  back up "everything" as an unprivileged user creates silent skips
  on /home, /root, /var/lib/* with no upside vs the threat model
  the agent already implies).

* Init-repo end-to-end. New JobKind="init" wired through agent
  runner, restic.Env.RunInit, server dispatcher, and a UI button
  (red "Initialise repo" in the run-now panel). hosts.repo_initialised_at
  flips on init success, on backup success, or on a non-empty
  snapshots.report. The "Run now" / "Init" / "Retry" branching now
  drives both the dashboard host row and the host-detail panel.
  Migrations 0004 (column), 0005 (jobs.kind CHECK widened — using
  the safe create-new-then-rename pattern; first version corrupted
  job_logs.job_id FK), 0006 (cleans up job_logs FK on already-
  affected DBs).

* rest-server creds embedded at exec time only. restic.Env gains
  RepoUsername; mergeRestCreds() builds the user:pass@-prefixed URL
  inside envSlice() and never assigns it back to the struct, so
  nothing slog-able ever sees the cleartext form. RedactURL helper
  for any future surface that needs to log a URL safely. Both
  helpers tested.

* Add-host UX. Repo password is now optional — server mints a
  24-byte URL-safe random one and surfaces it once, alongside an
  htpasswd snippet ("echo PASS | htpasswd -B -i ... USERNAME") so
  the operator pastes one command on the rest-server host and one
  on the endpoint. Result page also links the install snippet at
  /install/install.sh (was /install.sh — 404'd before) and pipes
  to bash (not sh — script uses set -o pipefail and other
  bashisms; on Debian/Ubuntu sh is dash).

* Late-subscriber race in JobHub. A fast-failing job could finish
  (DB write + Broadcast) before the browser's HX-Redirect → page
  load → WS-connect path completed, so the JS sat forever waiting
  on a job.finished that already passed. JobHub split into
  Register + Send + Run; handleJobStream now subscribes first,
  re-fetches the job, and sends a synthetic job.finished if the
  state is already terminal.

* HTMX error visibility. New toast partial listens to
  htmx:responseError and surfaces the response body as a
  bottom-right toast — every server-side validation error now
  becomes visible without per-handler JS wiring. Also handles
  custom rm:toast events for future server-pushed notifications
  via the HX-Trigger header. Themed via existing CSS vars.

* Dashboard rows are now whole-row clickable to host detail
  (CSS card-link pattern: absolute-positioned anchor + .row-action
  z-index restoration so the action button stays clickable).
  "View →" on a running job links to /jobs/<id> rather than
  /hosts/<id> since the row click already covers the host page.

* "Run first" / "Run first backup" → "Run now" everywhere for
  consistency.

* runbook (docs/e2e-smoke.md) updated — live-log streaming step
  now reflects P1-26; mentions the browser-driven Run-now flow.

* _diag/dump-creds — moved out of cmd/ so go build doesn't pick
  it up; .gitignore now excludes /_diag/ entirely.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-02 11:02:12 +01:00

242 lines
7.6 KiB
Go

package store
import (
"context"
"database/sql"
"encoding/json"
"errors"
"fmt"
"time"
)
// CreateHost persists a newly enrolled host row. The caller supplies
// the host ID (inside h) together with the already-hashed agent bearer
// token and the certificate pin; nothing is generated here.
//
// Every new host is written with status 'offline'. enrolled_at is
// stored as a UTC RFC 3339 timestamp with nanosecond precision. Tags
// and default paths are stored as JSON text; a nil DefaultPaths is
// replaced by an empty list before encoding so it is not marshalled
// from a nil slice.
//
// Any marshalling or database failure is returned wrapped with a
// "store:" prefix.
func (s *Store) CreateHost(ctx context.Context, h Host, agentTokenHash, certPinSHA256 string) error {
	const insertHost = `INSERT INTO hosts (
id, name, os, arch, agent_version, restic_version, protocol_version,
enrolled_at, status, tags,
agent_token_hash, cert_pin_sha256, default_paths
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, 'offline', ?, ?, ?, ?)`

	tagsJSON, err := json.Marshal(h.Tags)
	if err != nil {
		return fmt.Errorf("store: marshal tags: %w", err)
	}

	// Work on a local copy so the nil -> empty normalisation is explicit.
	paths := h.DefaultPaths
	if paths == nil {
		paths = []string{}
	}
	pathsJSON, err := json.Marshal(paths)
	if err != nil {
		return fmt.Errorf("store: marshal default_paths: %w", err)
	}

	enrolledAt := h.EnrolledAt.UTC().Format(time.RFC3339Nano)
	if _, err := s.db.ExecContext(ctx, insertHost,
		h.ID, h.Name, h.OS, h.Arch,
		h.AgentVersion, h.ResticVersion, h.ProtocolVersion,
		enrolledAt,
		string(tagsJSON),
		agentTokenHash, certPinSHA256,
		string(pathsJSON),
	); err != nil {
		return fmt.Errorf("store: create host: %w", err)
	}
	return nil
}
// LookupHostByAgentToken returns the host whose stored
// agent_token_hash equals tokenHash. The argument is the hash of the
// bearer token, not the token itself. ErrNotFound is returned when no
// row matches.
func (s *Store) LookupHostByAgentToken(ctx context.Context, tokenHash string) (*Host, error) {
	// The column list and its order must stay in step with scanHostRow.
	const hostByTokenHash = `SELECT id, name, os, arch, agent_version, restic_version, protocol_version,
enrolled_at, last_seen_at, status, repo_id, tags,
current_job_id, last_backup_at, last_backup_status,
repo_size_bytes, snapshot_count, open_alert_count,
applied_schedule_version, default_paths, repo_initialised_at
FROM hosts WHERE agent_token_hash = ?`

	return scanHost(s.db.QueryRowContext(ctx, hostByTokenHash, tokenHash))
}
// GetHost fetches a single host by its primary key. ErrNotFound is
// returned when no row has the given id.
func (s *Store) GetHost(ctx context.Context, id string) (*Host, error) {
	// The column list and its order must stay in step with scanHostRow.
	const hostByID = `SELECT id, name, os, arch, agent_version, restic_version, protocol_version,
enrolled_at, last_seen_at, status, repo_id, tags,
current_job_id, last_backup_at, last_backup_status,
repo_size_bytes, snapshot_count, open_alert_count,
applied_schedule_version, default_paths, repo_initialised_at
FROM hosts WHERE id = ?`

	return scanHost(s.db.QueryRowContext(ctx, hostByID, id))
}
// MarkHostHello records the metadata an agent reports in its hello
// message: agent version, restic version and protocol version. It
// also sets last_seen_at to when (stored as UTC) and forces the
// host's status to 'online'.
//
// The number of affected rows is not inspected, so an id that matches
// no row is not reported as an error.
func (s *Store) MarkHostHello(ctx context.Context, id string, agentVersion, resticVersion string, protoVersion int, when time.Time) error {
	const markHello = `UPDATE hosts
SET agent_version = ?, restic_version = ?, protocol_version = ?,
last_seen_at = ?, status = 'online'
WHERE id = ?`

	seenAt := when.UTC().Format(time.RFC3339Nano)
	if _, err := s.db.ExecContext(ctx, markHello,
		agentVersion, resticVersion, protoVersion, seenAt, id,
	); err != nil {
		return fmt.Errorf("store: mark hello: %w", err)
	}
	return nil
}
// TouchHost records a heartbeat for the host: last_seen_at is set to
// when (stored as UTC). A host currently marked 'offline' is flipped
// back to 'online'; any other status value is left as it is. Marking
// hosts offline is done elsewhere (see MarkHostsOfflineStale).
//
// The number of affected rows is not inspected, so an id that matches
// no row is not reported as an error.
func (s *Store) TouchHost(ctx context.Context, id string, when time.Time) error {
	const touch = `UPDATE hosts
SET last_seen_at = ?,
status = CASE WHEN status = 'offline' THEN 'online' ELSE status END
WHERE id = ?`

	seenAt := when.UTC().Format(time.RFC3339Nano)
	if _, err := s.db.ExecContext(ctx, touch, seenAt, id); err != nil {
		return fmt.Errorf("store: touch host: %w", err)
	}
	return nil
}
// MarkHostsOfflineStale flips every host that is currently 'online'
// but has no last_seen_at, or whose last_seen_at sorts before cutoff,
// to 'offline'.
//
// It returns the number of rows changed so the caller can log
// non-zero sweeps. A failure to execute the update, or to read the
// affected-row count from the driver, is returned as an error rather
// than being reported as "zero rows".
//
// NOTE(review): cutoff is compared against the stored text in SQL.
// time.RFC3339Nano omits trailing zeros in the fractional second, so
// if the column is compared bytewise, two timestamps within the same
// second may order incorrectly (e.g. "...12.5Z" sorts before
// "...12Z"). The error is bounded by one second; confirm that is
// acceptable for the sweep interval.
func (s *Store) MarkHostsOfflineStale(ctx context.Context, cutoff time.Time) (int64, error) {
	res, err := s.db.ExecContext(ctx,
		`UPDATE hosts
SET status = 'offline'
WHERE status = 'online'
AND (last_seen_at IS NULL OR last_seen_at < ?)`,
		cutoff.UTC().Format(time.RFC3339Nano))
	if err != nil {
		return 0, fmt.Errorf("store: mark offline: %w", err)
	}

	affected, err := res.RowsAffected()
	if err != nil {
		// The update itself succeeded; only the count is unavailable.
		return 0, fmt.Errorf("store: mark offline: rows affected: %w", err)
	}
	return affected, nil
}
// ListHosts loads every host, ordered by name, into memory. There is
// no pagination: the whole table is returned in one slice.
//
// The result is a nil slice when the table is empty. If iteration
// stops early because of a driver error, that error is returned
// alongside whatever rows were decoded before it.
func (s *Store) ListHosts(ctx context.Context) ([]Host, error) {
	// The column list and its order must stay in step with scanHostRow.
	const allHosts = `SELECT id, name, os, arch, agent_version, restic_version, protocol_version,
enrolled_at, last_seen_at, status, repo_id, tags,
current_job_id, last_backup_at, last_backup_status,
repo_size_bytes, snapshot_count, open_alert_count,
applied_schedule_version, default_paths, repo_initialised_at
FROM hosts ORDER BY name`

	rows, err := s.db.QueryContext(ctx, allHosts)
	if err != nil {
		return nil, fmt.Errorf("store: list hosts: %w", err)
	}
	defer rows.Close()

	var hosts []Host
	for rows.Next() {
		host, scanErr := scanHostRow(rows)
		if scanErr != nil {
			return nil, scanErr
		}
		hosts = append(hosts, *host)
	}
	return hosts, rows.Err()
}
// ----- scan helpers --------------------------------------------------
// hostScanner is the one method scanHostRow needs from a query result.
// Both the single-row result passed in by scanHost and the row cursor
// iterated by ListHosts are handed to scanHostRow through it, so one
// decoder serves both code paths.
type hostScanner interface {
	Scan(dest ...any) error
}
// scanHost decodes the result of a single-row query into a Host.
//
// It is a thin, typed entry point over scanHostRow. scanHostRow
// already converts sql.ErrNoRows into ErrNotFound before returning, so
// a miss reaches the caller as ErrNotFound without any further
// translation here. A second errors.Is(err, sql.ErrNoRows) check at
// this level could never change the result and has been dropped.
func scanHost(row *sql.Row) (*Host, error) {
	return scanHostRow(row)
}
// scanHostRow decodes one hosts row from s into a Host.
//
// The destination order must match the column list used by GetHost,
// LookupHostByAgentToken and ListHosts.
//
// Behaviour:
//   - sql.ErrNoRows from Scan is reported as ErrNotFound; any other
//     Scan error is wrapped.
//   - enrolled_at is required and parsed as RFC 3339 (nanoseconds).
//     last_seen_at, last_backup_at and repo_initialised_at are
//     nullable; when present they are parsed the same way, and a
//     malformed value fails the whole row.
//   - repo_id, current_job_id and last_backup_status are nullable and
//     become pointers only when non-NULL.
//   - tags and default_paths are JSON text decoded on a best-effort
//     basis: empty text is skipped and decode errors are ignored
//     instead of failing the row.
func scanHostRow(s hostScanner) (*Host, error) {
	var (
		host Host

		// Required text columns.
		enrolledRaw string
		tagsJSON    string
		pathsJSON   string

		// Nullable columns.
		lastSeenRaw   sql.NullString
		lastBackupRaw sql.NullString
		repoInitRaw   sql.NullString
		repoID        sql.NullString
		currentJobID  sql.NullString
		lastBackupSt  sql.NullString
	)

	if err := s.Scan(
		&host.ID, &host.Name, &host.OS, &host.Arch,
		&host.AgentVersion, &host.ResticVersion, &host.ProtocolVersion,
		&enrolledRaw, &lastSeenRaw, &host.Status, &repoID, &tagsJSON,
		&currentJobID, &lastBackupRaw, &lastBackupSt,
		&host.RepoSizeBytes, &host.SnapshotCount, &host.OpenAlertCount,
		&host.AppliedScheduleVersion, &pathsJSON, &repoInitRaw,
	); err != nil {
		if errors.Is(err, sql.ErrNoRows) {
			return nil, ErrNotFound
		}
		return nil, fmt.Errorf("store: scan host: %w", err)
	}

	parseStamp := func(raw string) (time.Time, error) {
		return time.Parse(time.RFC3339Nano, raw)
	}

	// Timestamps. Checked in the order enrolled, last seen, last
	// backup, repo initialised, so the first malformed one is reported.
	enrolledAt, err := parseStamp(enrolledRaw)
	if err != nil {
		return nil, fmt.Errorf("store: parse enrolled_at: %w", err)
	}
	host.EnrolledAt = enrolledAt

	if lastSeenRaw.Valid {
		seenAt, err := parseStamp(lastSeenRaw.String)
		if err != nil {
			return nil, fmt.Errorf("store: parse last_seen_at: %w", err)
		}
		host.LastSeenAt = &seenAt
	}
	if lastBackupRaw.Valid {
		backupAt, err := parseStamp(lastBackupRaw.String)
		if err != nil {
			return nil, fmt.Errorf("store: parse last_backup_at: %w", err)
		}
		host.LastBackupAt = &backupAt
	}
	if repoInitRaw.Valid {
		initAt, err := parseStamp(repoInitRaw.String)
		if err != nil {
			return nil, fmt.Errorf("store: parse repo_initialised_at: %w", err)
		}
		host.RepoInitialisedAt = &initAt
	}

	// Nullable strings: the pointer stays nil when the column is NULL.
	if repoID.Valid {
		host.RepoID = &repoID.String
	}
	if currentJobID.Valid {
		host.CurrentJobID = &currentJobID.String
	}
	if lastBackupSt.Valid {
		host.LastBackupStatus = &lastBackupSt.String
	}

	// JSON columns are decoded best-effort; errors are deliberately
	// dropped so a bad value does not make the host unreadable.
	if tagsJSON != "" {
		_ = json.Unmarshal([]byte(tagsJSON), &host.Tags)
	}
	if pathsJSON != "" {
		_ = json.Unmarshal([]byte(pathsJSON), &host.DefaultPaths)
	}

	return &host, nil
}
// MarkHostRepoInitialised stamps repo_initialised_at with when (stored
// as UTC), but only if the column is currently NULL. Calling it again
// for a host that already has a timestamp changes nothing, so the
// first recorded value is never overwritten.
//
// The boolean result is true when this call set the timestamp and
// false when no row was changed (the WHERE clause matched nothing:
// the timestamp was already set, or no host has that id).
//
// An error is returned if the update fails or if the driver cannot
// report the affected-row count. In the latter case the update itself
// has succeeded, but the boolean cannot be determined, so the failure
// is reported instead of being read as "already initialised".
func (s *Store) MarkHostRepoInitialised(ctx context.Context, hostID string, when time.Time) (bool, error) {
	res, err := s.db.ExecContext(ctx,
		`UPDATE hosts SET repo_initialised_at = ?
WHERE id = ? AND repo_initialised_at IS NULL`,
		when.UTC().Format(time.RFC3339Nano), hostID)
	if err != nil {
		return false, fmt.Errorf("store: mark repo initialised: %w", err)
	}

	affected, err := res.RowsAffected()
	if err != nil {
		return false, fmt.Errorf("store: mark repo initialised: rows affected: %w", err)
	}
	return affected > 0, nil
}