phase 1: WS transport, enrollment, agent that hellos and heartbeats
Lands the protocol layer end-to-end: an agent can be enrolled through the operator UI, store credentials, dial back to the server over WS, complete the protocol_version handshake, and stay connected with periodic heartbeats. Server side: - P1-09 ws.Hub: one Conn per host_id, last-write-wins eviction, json envelope writer with a write mutex, reader, error envelopes. - P1-09 ws.AgentHandler: bearer-auth, accept upgrade, hello-stage (10s deadline, protocol_version checked against api.MinAgentProtocolVersion → ErrProtocolTooOld with help URL on reject), main read loop, defer hub register/unregister. - P1-10 POST /api/agents/enroll consumes a one-time token, mints a persistent agent bearer (sha-256 stored), creates a host row. - P1-10 POST /api/enrollment-tokens (operator, session-auth) issues a 1h one-time token. - P1-11 hello upserts agent_version + restic_version + protocol_version on the host row, flips status to online. - P1-12 heartbeat touches last_seen_at; background sweeper marks hosts offline after 90s without one. - store: hosts table accessors, host_schedule_version, enrollment_tokens FK on consumed_host dropped (audit-only field; the token gets burned before the host row exists). Agent side: - P1-13 internal/agent/config: yaml at /etc/restic-manager/agent.yaml, atomic Save (tmp+fsync+rename), Enrolled() helper. - P1-15 internal/agent/wsclient: dial with bearer + optional TLS cert pinning (sha-256 of leaf), exponential backoff with jitter (1s → 60s cap), heartbeat goroutine, fatal handling for ErrProtocolTooOld. - P1-15 wsclient.Enroll: HTTP POST /api/agents/enroll with sysinfo. - P1-17 internal/agent/sysinfo: hostname/OS/arch/restic-version collection. restic detected by `restic version` parse; absent restic doesn't block startup. - cmd/agent: -enroll-server / -enroll-token flags drive first-run enrollment then exit (so the install script can hand off to systemd to run the persistent service). 
End-to-end smoke verified: bootstrap → login → issue token → enroll → run agent → server logs `ws agent connected` with the right host_id and protocol_version 1. All tests still pass. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,205 @@
|
||||
package store
|
||||
|
||||
import (
|
||||
"context"
|
||||
"database/sql"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"time"
|
||||
)
|
||||
|
||||
// CreateHost inserts a new host row. Used by the enrollment flow.
|
||||
// The caller has already minted the host id and hashed the agent
|
||||
// bearer token.
|
||||
func (s *Store) CreateHost(ctx context.Context, h Host, agentTokenHash, certPinSHA256 string) error {
|
||||
tags, err := json.Marshal(h.Tags)
|
||||
if err != nil {
|
||||
return fmt.Errorf("store: marshal tags: %w", err)
|
||||
}
|
||||
_, err = s.db.ExecContext(ctx,
|
||||
`INSERT INTO hosts (
|
||||
id, name, os, arch, agent_version, restic_version, protocol_version,
|
||||
enrolled_at, status, tags,
|
||||
agent_token_hash, cert_pin_sha256
|
||||
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, 'offline', ?, ?, ?)`,
|
||||
h.ID, h.Name, h.OS, h.Arch,
|
||||
h.AgentVersion, h.ResticVersion, h.ProtocolVersion,
|
||||
h.EnrolledAt.UTC().Format(time.RFC3339Nano),
|
||||
string(tags),
|
||||
agentTokenHash, certPinSHA256)
|
||||
if err != nil {
|
||||
return fmt.Errorf("store: create host: %w", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// LookupHostByAgentToken resolves a hashed agent bearer token to the
|
||||
// host it belongs to. Returns ErrNotFound on miss.
|
||||
func (s *Store) LookupHostByAgentToken(ctx context.Context, tokenHash string) (*Host, error) {
|
||||
row := s.db.QueryRowContext(ctx,
|
||||
`SELECT id, name, os, arch, agent_version, restic_version, protocol_version,
|
||||
enrolled_at, last_seen_at, status, repo_id, tags,
|
||||
current_job_id, last_backup_at, last_backup_status,
|
||||
repo_size_bytes, snapshot_count, open_alert_count,
|
||||
applied_schedule_version
|
||||
FROM hosts WHERE agent_token_hash = ?`,
|
||||
tokenHash)
|
||||
return scanHost(row)
|
||||
}
|
||||
|
||||
// GetHost returns a host by ID. Returns ErrNotFound on miss.
|
||||
func (s *Store) GetHost(ctx context.Context, id string) (*Host, error) {
|
||||
row := s.db.QueryRowContext(ctx,
|
||||
`SELECT id, name, os, arch, agent_version, restic_version, protocol_version,
|
||||
enrolled_at, last_seen_at, status, repo_id, tags,
|
||||
current_job_id, last_backup_at, last_backup_status,
|
||||
repo_size_bytes, snapshot_count, open_alert_count,
|
||||
applied_schedule_version
|
||||
FROM hosts WHERE id = ?`, id)
|
||||
return scanHost(row)
|
||||
}
|
||||
|
||||
// MarkHostHello updates the host row with metadata received in the
|
||||
// agent's hello message and flips status to 'online'.
|
||||
func (s *Store) MarkHostHello(ctx context.Context, id string, agentVersion, resticVersion string, protoVersion int, when time.Time) error {
|
||||
_, err := s.db.ExecContext(ctx,
|
||||
`UPDATE hosts
|
||||
SET agent_version = ?, restic_version = ?, protocol_version = ?,
|
||||
last_seen_at = ?, status = 'online'
|
||||
WHERE id = ?`,
|
||||
agentVersion, resticVersion, protoVersion,
|
||||
when.UTC().Format(time.RFC3339Nano), id)
|
||||
if err != nil {
|
||||
return fmt.Errorf("store: mark hello: %w", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// TouchHost updates last_seen_at on heartbeat, leaving status alone if
|
||||
// already online (the offline-marker is a separate sweep).
|
||||
func (s *Store) TouchHost(ctx context.Context, id string, when time.Time) error {
|
||||
_, err := s.db.ExecContext(ctx,
|
||||
`UPDATE hosts
|
||||
SET last_seen_at = ?,
|
||||
status = CASE WHEN status = 'offline' THEN 'online' ELSE status END
|
||||
WHERE id = ?`,
|
||||
when.UTC().Format(time.RFC3339Nano), id)
|
||||
if err != nil {
|
||||
return fmt.Errorf("store: touch host: %w", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// MarkHostsOfflineStale flips any host that hasn't been seen since
|
||||
// before `cutoff` from 'online' to 'offline'. Returns the number of
|
||||
// rows affected so the caller can log non-zero events.
|
||||
func (s *Store) MarkHostsOfflineStale(ctx context.Context, cutoff time.Time) (int64, error) {
|
||||
res, err := s.db.ExecContext(ctx,
|
||||
`UPDATE hosts
|
||||
SET status = 'offline'
|
||||
WHERE status = 'online'
|
||||
AND (last_seen_at IS NULL OR last_seen_at < ?)`,
|
||||
cutoff.UTC().Format(time.RFC3339Nano))
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("store: mark offline: %w", err)
|
||||
}
|
||||
n, _ := res.RowsAffected()
|
||||
return n, nil
|
||||
}
|
||||
|
||||
// ListHosts returns every host. Phase 1 callers fit a small fleet in
|
||||
// memory; pagination lands when it matters.
|
||||
func (s *Store) ListHosts(ctx context.Context) ([]Host, error) {
|
||||
rows, err := s.db.QueryContext(ctx,
|
||||
`SELECT id, name, os, arch, agent_version, restic_version, protocol_version,
|
||||
enrolled_at, last_seen_at, status, repo_id, tags,
|
||||
current_job_id, last_backup_at, last_backup_status,
|
||||
repo_size_bytes, snapshot_count, open_alert_count,
|
||||
applied_schedule_version
|
||||
FROM hosts ORDER BY name`)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("store: list hosts: %w", err)
|
||||
}
|
||||
defer rows.Close()
|
||||
var out []Host
|
||||
for rows.Next() {
|
||||
h, err := scanHostRow(rows)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
out = append(out, *h)
|
||||
}
|
||||
return out, rows.Err()
|
||||
}
|
||||
|
||||
// ----- scan helpers --------------------------------------------------
|
||||
|
||||
// hostScanner is the Scan method shared by the single-row and
// multi-row query results used in this file, so one decoder
// (scanHostRow) can serve both.
type hostScanner interface {
	// Scan copies the columns of the current row into dest.
	Scan(dest ...any) error
}
|
||||
|
||||
func scanHost(row *sql.Row) (*Host, error) {
|
||||
h, err := scanHostRow(row)
|
||||
if errors.Is(err, sql.ErrNoRows) {
|
||||
return nil, ErrNotFound
|
||||
}
|
||||
return h, err
|
||||
}
|
||||
|
||||
func scanHostRow(s hostScanner) (*Host, error) {
|
||||
var h Host
|
||||
var (
|
||||
lastSeen, lastBackupAt sql.NullString
|
||||
repoID, currentJob, lastBkSt sql.NullString
|
||||
enrolled string
|
||||
tags string
|
||||
)
|
||||
err := s.Scan(&h.ID, &h.Name, &h.OS, &h.Arch,
|
||||
&h.AgentVersion, &h.ResticVersion, &h.ProtocolVersion,
|
||||
&enrolled, &lastSeen, &h.Status, &repoID, &tags,
|
||||
¤tJob, &lastBackupAt, &lastBkSt,
|
||||
&h.RepoSizeBytes, &h.SnapshotCount, &h.OpenAlertCount,
|
||||
&h.AppliedScheduleVersion)
|
||||
if err != nil {
|
||||
if errors.Is(err, sql.ErrNoRows) {
|
||||
return nil, ErrNotFound
|
||||
}
|
||||
return nil, fmt.Errorf("store: scan host: %w", err)
|
||||
}
|
||||
t, err := time.Parse(time.RFC3339Nano, enrolled)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("store: parse enrolled_at: %w", err)
|
||||
}
|
||||
h.EnrolledAt = t
|
||||
if lastSeen.Valid {
|
||||
t, err := time.Parse(time.RFC3339Nano, lastSeen.String)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("store: parse last_seen_at: %w", err)
|
||||
}
|
||||
h.LastSeenAt = &t
|
||||
}
|
||||
if lastBackupAt.Valid {
|
||||
t, err := time.Parse(time.RFC3339Nano, lastBackupAt.String)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("store: parse last_backup_at: %w", err)
|
||||
}
|
||||
h.LastBackupAt = &t
|
||||
}
|
||||
if repoID.Valid {
|
||||
s := repoID.String
|
||||
h.RepoID = &s
|
||||
}
|
||||
if currentJob.Valid {
|
||||
s := currentJob.String
|
||||
h.CurrentJobID = &s
|
||||
}
|
||||
if lastBkSt.Valid {
|
||||
s := lastBkSt.String
|
||||
h.LastBackupStatus = &s
|
||||
}
|
||||
if tags != "" {
|
||||
_ = json.Unmarshal([]byte(tags), &h.Tags)
|
||||
}
|
||||
return &h, nil
|
||||
}
|
||||
@@ -92,12 +92,15 @@ CREATE INDEX hosts_status ON hosts(status);
|
||||
CREATE INDEX hosts_last_seen_at ON hosts(last_seen_at);
|
||||
|
||||
-- Pending one-time enrollment tokens (TTL'd, single-use).
|
||||
-- consumed_host is audit-only (no FK on purpose: we burn the token
|
||||
-- before the host row exists, and we want this trail to survive a
|
||||
-- later host deletion).
|
||||
CREATE TABLE enrollment_tokens (
|
||||
token_hash TEXT PRIMARY KEY, -- argon2id of token
|
||||
token_hash TEXT PRIMARY KEY,
|
||||
created_at TEXT NOT NULL,
|
||||
expires_at TEXT NOT NULL,
|
||||
consumed_at TEXT,
|
||||
consumed_host TEXT REFERENCES hosts(id) ON DELETE SET NULL
|
||||
consumed_host TEXT
|
||||
);
|
||||
CREATE INDEX enrollment_tokens_expires_at ON enrollment_tokens(expires_at);
|
||||
|
||||
|
||||
Reference in New Issue
Block a user