9ec69456fe
Widen the SQL query to consider all statuses (queued, running, succeeded, failed, cancelled) rather than terminal-only. An in-flight prune that outlasts the 60s tick interval previously produced ErrNotFound, causing the ticker to anchor at now-24h and fire a second prune concurrently with the first. Update the doc comment and test: remove the "queued job filtered out" case, add assertions that a running job and a queued job are each returned as the latest.
285 lines
8.1 KiB
Go
285 lines
8.1 KiB
Go
package store
|
|
|
|
import (
|
|
"context"
|
|
"database/sql"
|
|
"encoding/json"
|
|
"errors"
|
|
"fmt"
|
|
"time"
|
|
)
|
|
|
|
// Job is the in-memory form of one row of the jobs table. Optional
// columns are modelled as pointers (or, for Stats, a possibly-nil
// slice) so that "not set yet" stays distinguishable from a zero
// value.
type Job struct {
	// Identity and current state.
	ID     string
	HostID string
	Kind   string
	Status string // inserted as 'queued'; MarkJobStarted sets 'running'; MarkJobFinished sets the final value

	// Provenance: who or what asked for the job.
	ScheduledID *string // set when the job originates from a cron fire; nil for operator-driven run-now
	ActorKind   string  // user|schedule|system
	ActorID     *string

	// Lifecycle fields, filled in as the job progresses.
	StartedAt  *time.Time      // written by MarkJobStarted
	FinishedAt *time.Time      // written by MarkJobFinished
	ExitCode   *int            // written by MarkJobFinished
	Stats      json.RawMessage // raw JSON written by MarkJobFinished
	Error      *string         // error text written by MarkJobFinished; nil when none was recorded

	CreatedAt time.Time
}
|
|
|
|
// CreateJob inserts a queued job. The agent will mark it running
|
|
// when it actually starts work. ScheduledID is set when the job
|
|
// originates from a cron fire (actor_kind="schedule"); nil for
|
|
// operator-driven run-now.
|
|
func (s *Store) CreateJob(ctx context.Context, j Job) error {
|
|
_, err := s.db.ExecContext(ctx,
|
|
`INSERT INTO jobs (id, host_id, kind, status, scheduled_id, actor_kind, actor_id, created_at)
|
|
VALUES (?, ?, ?, 'queued', ?, ?, ?, ?)`,
|
|
j.ID, j.HostID, j.Kind,
|
|
nullable(j.ScheduledID), j.ActorKind, nullable(j.ActorID),
|
|
j.CreatedAt.UTC().Format(time.RFC3339Nano))
|
|
if err != nil {
|
|
return fmt.Errorf("store: create job: %w", err)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// MarkJobStarted flips status to 'running' and records started_at.
|
|
func (s *Store) MarkJobStarted(ctx context.Context, id string, when time.Time) error {
|
|
res, err := s.db.ExecContext(ctx,
|
|
`UPDATE jobs
|
|
SET status = 'running', started_at = ?
|
|
WHERE id = ? AND status IN ('queued','running')`,
|
|
when.UTC().Format(time.RFC3339Nano), id)
|
|
if err != nil {
|
|
return fmt.Errorf("store: mark started: %w", err)
|
|
}
|
|
n, _ := res.RowsAffected()
|
|
if n == 0 {
|
|
return ErrNotFound
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// MarkJobFinished records the terminal state.
|
|
func (s *Store) MarkJobFinished(ctx context.Context, id, status string, exitCode int, stats json.RawMessage, errMsg string, when time.Time) error {
|
|
if len(stats) == 0 {
|
|
stats = json.RawMessage("null")
|
|
}
|
|
res, err := s.db.ExecContext(ctx,
|
|
`UPDATE jobs
|
|
SET status = ?, finished_at = ?, exit_code = ?, stats = ?, error = ?
|
|
WHERE id = ?`,
|
|
status,
|
|
when.UTC().Format(time.RFC3339Nano),
|
|
exitCode, string(stats), nullableStr(errMsg), id)
|
|
if err != nil {
|
|
return fmt.Errorf("store: mark finished: %w", err)
|
|
}
|
|
n, _ := res.RowsAffected()
|
|
if n == 0 {
|
|
return ErrNotFound
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// AppendJobLog records one line of agent output. seq is the agent's
|
|
// monotonic sequence number; gaps imply lost data.
|
|
func (s *Store) AppendJobLog(ctx context.Context, jobID string, seq int64, ts time.Time, stream, payload string) error {
|
|
_, err := s.db.ExecContext(ctx,
|
|
`INSERT INTO job_logs (job_id, seq, ts, stream, payload) VALUES (?,?,?,?,?)`,
|
|
jobID, seq, ts.UTC().Format(time.RFC3339Nano), stream, payload)
|
|
if err != nil {
|
|
return fmt.Errorf("store: append job log: %w", err)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// JobLogLine is a single log line as read back from the job_logs
// table, ready to render.
type JobLogLine struct {
	Seq     int64     // the agent's sequence number for this line
	TS      time.Time // timestamp parsed from the stored RFC 3339 string
	Stream  string    // stdout|stderr|event
	Payload string    // the line's content as stored
}
|
|
|
|
// ListJobLogs returns persisted log lines for a job in seq order.
|
|
// afterSeq lets pagers / reconnect-resuming clients fetch only the
|
|
// tail; passing 0 returns from the beginning. limit caps the result
|
|
// (0 means no cap).
|
|
func (s *Store) ListJobLogs(ctx context.Context, jobID string, afterSeq int64, limit int) ([]JobLogLine, error) {
|
|
q := `SELECT seq, ts, stream, payload FROM job_logs
|
|
WHERE job_id = ? AND seq > ? ORDER BY seq ASC`
|
|
args := []any{jobID, afterSeq}
|
|
if limit > 0 {
|
|
q += ` LIMIT ?`
|
|
args = append(args, limit)
|
|
}
|
|
rows, err := s.db.QueryContext(ctx, q, args...)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("store: list job logs: %w", err)
|
|
}
|
|
defer func() { _ = rows.Close() }()
|
|
var out []JobLogLine
|
|
for rows.Next() {
|
|
var l JobLogLine
|
|
var ts string
|
|
if err := rows.Scan(&l.Seq, &ts, &l.Stream, &l.Payload); err != nil {
|
|
return nil, fmt.Errorf("store: scan job log: %w", err)
|
|
}
|
|
t, perr := time.Parse(time.RFC3339Nano, ts)
|
|
if perr != nil {
|
|
return nil, fmt.Errorf("store: parse job log ts: %w", perr)
|
|
}
|
|
l.TS = t
|
|
out = append(out, l)
|
|
}
|
|
return out, rows.Err()
|
|
}
|
|
|
|
// GetJob returns a job row.
|
|
func (s *Store) GetJob(ctx context.Context, id string) (*Job, error) {
|
|
row := s.db.QueryRowContext(ctx,
|
|
`SELECT id, host_id, kind, status, scheduled_id, actor_kind, actor_id,
|
|
started_at, finished_at, exit_code, stats, error, created_at
|
|
FROM jobs WHERE id = ?`, id)
|
|
var (
|
|
j Job
|
|
schedID sql.NullString
|
|
actorID sql.NullString
|
|
startedAt sql.NullString
|
|
finishedAt sql.NullString
|
|
exitCode sql.NullInt64
|
|
stats sql.NullString
|
|
errMsg sql.NullString
|
|
createdAt string
|
|
)
|
|
if err := row.Scan(&j.ID, &j.HostID, &j.Kind, &j.Status, &schedID,
|
|
&j.ActorKind, &actorID, &startedAt, &finishedAt,
|
|
&exitCode, &stats, &errMsg, &createdAt); err != nil {
|
|
if errors.Is(err, sql.ErrNoRows) {
|
|
return nil, ErrNotFound
|
|
}
|
|
return nil, fmt.Errorf("store: scan job: %w", err)
|
|
}
|
|
if schedID.Valid {
|
|
s := schedID.String
|
|
j.ScheduledID = &s
|
|
}
|
|
if actorID.Valid {
|
|
s := actorID.String
|
|
j.ActorID = &s
|
|
}
|
|
if startedAt.Valid {
|
|
t, _ := time.Parse(time.RFC3339Nano, startedAt.String)
|
|
j.StartedAt = &t
|
|
}
|
|
if finishedAt.Valid {
|
|
t, _ := time.Parse(time.RFC3339Nano, finishedAt.String)
|
|
j.FinishedAt = &t
|
|
}
|
|
if exitCode.Valid {
|
|
i := int(exitCode.Int64)
|
|
j.ExitCode = &i
|
|
}
|
|
if stats.Valid && stats.String != "" {
|
|
j.Stats = json.RawMessage(stats.String)
|
|
}
|
|
if errMsg.Valid {
|
|
s := errMsg.String
|
|
j.Error = &s
|
|
}
|
|
t, _ := time.Parse(time.RFC3339Nano, createdAt)
|
|
j.CreatedAt = t
|
|
return &j, nil
|
|
}
|
|
|
|
// LatestJobByKind returns the most recent job (any status, including
|
|
// queued and running) of the given kind for the host, or
|
|
// (nil, ErrNotFound) if no such job exists. Used by the maintenance
|
|
// ticker to compute "last fire" anchors for the cron-due check;
|
|
// in-flight jobs MUST be considered or a long-running prune (>60s)
|
|
// would re-fire on the next tick while the first is still running.
|
|
func (s *Store) LatestJobByKind(ctx context.Context, hostID, kind string) (*Job, error) {
|
|
row := s.db.QueryRowContext(ctx,
|
|
`SELECT id, host_id, kind, status, scheduled_id, actor_kind, actor_id,
|
|
started_at, finished_at, exit_code, stats, error, created_at
|
|
FROM jobs
|
|
WHERE host_id = ? AND kind = ?
|
|
ORDER BY created_at DESC
|
|
LIMIT 1`, hostID, kind)
|
|
var (
|
|
j Job
|
|
schedID sql.NullString
|
|
actorID sql.NullString
|
|
startedAt sql.NullString
|
|
finishedAt sql.NullString
|
|
exitCode sql.NullInt64
|
|
stats sql.NullString
|
|
errMsg sql.NullString
|
|
createdAt string
|
|
)
|
|
if err := row.Scan(&j.ID, &j.HostID, &j.Kind, &j.Status, &schedID,
|
|
&j.ActorKind, &actorID, &startedAt, &finishedAt,
|
|
&exitCode, &stats, &errMsg, &createdAt); err != nil {
|
|
if errors.Is(err, sql.ErrNoRows) {
|
|
return nil, ErrNotFound
|
|
}
|
|
return nil, fmt.Errorf("store: scan latest job by kind: %w", err)
|
|
}
|
|
if schedID.Valid {
|
|
s := schedID.String
|
|
j.ScheduledID = &s
|
|
}
|
|
if actorID.Valid {
|
|
s := actorID.String
|
|
j.ActorID = &s
|
|
}
|
|
if startedAt.Valid {
|
|
t, _ := time.Parse(time.RFC3339Nano, startedAt.String)
|
|
j.StartedAt = &t
|
|
}
|
|
if finishedAt.Valid {
|
|
t, _ := time.Parse(time.RFC3339Nano, finishedAt.String)
|
|
j.FinishedAt = &t
|
|
}
|
|
if exitCode.Valid {
|
|
i := int(exitCode.Int64)
|
|
j.ExitCode = &i
|
|
}
|
|
if stats.Valid && stats.String != "" {
|
|
j.Stats = json.RawMessage(stats.String)
|
|
}
|
|
if errMsg.Valid {
|
|
s := errMsg.String
|
|
j.Error = &s
|
|
}
|
|
t, _ := time.Parse(time.RFC3339Nano, createdAt)
|
|
j.CreatedAt = t
|
|
return &j, nil
|
|
}
|
|
|
|
// HasJobOfKind reports whether any job of the given kind exists for
|
|
// this host, regardless of status. Used by the auto-init path on
|
|
// agent hello to decide whether to dispatch a fresh `restic init` —
|
|
// once we've tried once we don't auto-retry, even on failure
|
|
// (failed init usually means bad creds; retrying every reconnect
|
|
// just piles up failed rows). The operator can re-init manually via
|
|
// the Repo page's danger zone.
|
|
func (s *Store) HasJobOfKind(ctx context.Context, hostID, kind string) (bool, error) {
|
|
var n int
|
|
err := s.db.QueryRowContext(ctx,
|
|
`SELECT COUNT(*) FROM jobs WHERE host_id = ? AND kind = ?`,
|
|
hostID, kind).Scan(&n)
|
|
if err != nil {
|
|
return false, fmt.Errorf("store: count jobs of kind: %w", err)
|
|
}
|
|
return n > 0, nil
|
|
}
|
|
|
|
// nullableStr converts s into a query argument: the empty string
// becomes an untyped nil (which database/sql binds as NULL), any other
// value is passed through unchanged.
func nullableStr(s string) any {
	if s != "" {
		return s
	}
	return nil
}
|