e871b05b38
CI / Test (linux/amd64) (pull_request) Successful in 34s
CI / Lint (pull_request) Failing after 16s
CI / Build (windows/amd64) (pull_request) Successful in 22s
CI / Build (linux/amd64) (pull_request) Successful in 20s
CI / Build (linux/arm64) (pull_request) Successful in 21s
Cleanup pass over the repo so CI can enforce lint going forward
without the only-new-issues escape hatch:
* gofumpt -w across the tree (31 hits, all formatting)
* misspell --fix (25 hits, US-locale spelling) — but reverted on
api.JobCancelled = "cancelled" since that literal is the wire +
DB CHECK constraint value, plus matched the case in store/fleet.go
back to "cancelled" and added //nolint:misspell on both for the
next time someone reaches for the auto-fix
* Wrap every `defer rows.Close()` / `defer stmt.Close()` /
`defer res.Body.Close()` in `defer func() { _ = x.Close() }()`
to satisfy errcheck without losing the close itself
* websocket.Dial callers (1 prod, 4 tests) now capture + close the
upgrade response Body — coder/websocket can return res with a nil
Body on success, so the test deferred-closes guard against that
* Annotate the two genuine-by-design nilerr cases with //nolint
comments explaining why nil-on-error is the contract (cookie
missing = no session; ctx cancelled mid-backoff = clean shutdown)
* Add brief godoc on the 10 exported const groups + types that
revive flagged (api.HostOS/HostArch/JobKind/JobStatus/LogStream/
ErrorCode, restic.EventKind, store.Role, web.FS)
* Drop the unused (*Server).userByID method
* Inline the unparam baseView(active) — every UI page is under
the dashboard primary nav today
Result: `golangci-lint run ./...` reports 0 issues. CI lint job
no longer needs only-new-issues: true; X-06 follow-up entry in
tasks.md removed.
170 lines
5.1 KiB
Go
170 lines
5.1 KiB
Go
package scheduler
|
|
|
|
import (
|
|
"log/slog"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/robfig/cron/v3"
|
|
|
|
"gitea.dcglab.co.uk/steve/restic-manager/internal/api"
|
|
)
|
|
|
|
// Sender abstracts away the agent's outbound WS channel — we use it
|
|
// to fire schedule.fire and schedule.ack envelopes back at the
|
|
// server. Same shape as runner.Sender; deliberately not shared so
|
|
// the scheduler can be tested without dragging in the runner.
|
|
type Sender interface {
|
|
Send(env api.Envelope) error
|
|
}
|
|
|
|
// Scheduler maintains the agent's local cron entries. Schedules
|
|
// arrive from the server via Apply (driven by MsgScheduleSet); on
|
|
// each fire, the entry sends a schedule.fire to the server and
|
|
// lets the server's existing dispatch path turn that into a
|
|
// command.run. The scheduler itself never builds CommandRunPayloads.
|
|
//
|
|
// Lifecycle:
|
|
// - Start once at agent boot.
|
|
// - Apply on every MsgScheduleSet — replaces the active cron with
|
|
// a fresh one, then emits schedule.ack with the version we just
|
|
// applied.
|
|
// - Stop on agent shutdown.
|
|
//
|
|
// The active Sender is updated on every Apply call. This handles
|
|
// reconnects naturally: a new connection's first MsgScheduleSet
|
|
// re-arms the scheduler with a working tx; cron entries that fire
|
|
// against a dropped connection just log and skip the tick.
|
|
type Scheduler struct {
|
|
mu sync.Mutex
|
|
current *cron.Cron
|
|
version int64
|
|
tx Sender
|
|
}
|
|
|
|
// New builds a Scheduler. Doesn't start any cron yet — Apply is
|
|
// what brings the loop alive.
|
|
func New() *Scheduler {
|
|
return &Scheduler{}
|
|
}
|
|
|
|
// Stop halts whatever cron is currently running. Safe to call
|
|
// multiple times.
|
|
func (s *Scheduler) Stop() {
|
|
s.mu.Lock()
|
|
defer s.mu.Unlock()
|
|
if s.current != nil {
|
|
<-s.current.Stop().Done()
|
|
s.current = nil
|
|
}
|
|
}
|
|
|
|
// Apply reconciles the active cron with payload. Stops the old cron
|
|
// (waiting for in-flight entries to return), builds a new one from
|
|
// every enabled entry, starts it, and emits schedule.ack with
|
|
// payload.Version. Schedule entries with malformed cron exprs are
|
|
// logged and skipped — the server's validator should have caught
|
|
// these, but better skip-and-warn than crash the loop.
|
|
//
|
|
// Payload's order doesn't matter; we always rebuild from scratch.
|
|
// Empty Schedules is a valid input that effectively disables every
|
|
// timed job for this host.
|
|
func (s *Scheduler) Apply(payload api.ScheduleSetPayload, tx Sender) {
|
|
s.mu.Lock()
|
|
s.tx = tx
|
|
|
|
// Stop the previous cron, if any. cron.Stop returns once the
|
|
// scheduler has stopped firing new entries; in-flight ones
|
|
// continue in their own goroutines, which is what we want
|
|
// (otherwise a long-running backup would block reconciliation).
|
|
if s.current != nil {
|
|
<-s.current.Stop().Done()
|
|
s.current = nil
|
|
}
|
|
|
|
c := cron.New()
|
|
added := 0
|
|
for _, sch := range payload.Schedules {
|
|
if !sch.Enabled {
|
|
continue
|
|
}
|
|
// Capture by value so the closure doesn't share id across iters.
|
|
entry := sch
|
|
_, err := c.AddFunc(entry.CronExpr, func() {
|
|
s.fire(entry)
|
|
})
|
|
if err != nil {
|
|
slog.Warn("scheduler: skipping entry with bad cron expr",
|
|
"schedule_id", entry.ID, "expr", entry.CronExpr, "err", err)
|
|
continue
|
|
}
|
|
added++
|
|
}
|
|
c.Start()
|
|
s.current = c
|
|
s.version = payload.Version
|
|
ackTx := s.tx
|
|
s.mu.Unlock()
|
|
|
|
slog.Info("scheduler: applied", "version", payload.Version,
|
|
"received", len(payload.Schedules), "active", added)
|
|
|
|
// Ack outside the lock — Send() shouldn't take long, but holding
|
|
// s.mu across an external call would needlessly serialize other
|
|
// callers (e.g. a future Status() inspection from the UI).
|
|
ackEnv, err := api.Marshal(api.MsgScheduleAck, "", api.ScheduleAckPayload{
|
|
Version: payload.Version,
|
|
AppliedAt: time.Now().UTC(),
|
|
})
|
|
if err != nil {
|
|
slog.Error("scheduler: marshal schedule.ack", "err", err)
|
|
return
|
|
}
|
|
if ackTx == nil {
|
|
return
|
|
}
|
|
if err := ackTx.Send(ackEnv); err != nil {
|
|
slog.Warn("scheduler: send schedule.ack — server will retry on reconnect",
|
|
"version", payload.Version, "err", err)
|
|
}
|
|
}
|
|
|
|
// Version returns the schedule version currently applied. Useful for
|
|
// tests + diagnostics.
|
|
func (s *Scheduler) Version() int64 {
|
|
s.mu.Lock()
|
|
defer s.mu.Unlock()
|
|
return s.version
|
|
}
|
|
|
|
// fire runs when one of the cron entries' time arrives. Sends a
|
|
// schedule.fire envelope to the server, which is responsible for
|
|
// minting the job_id, persisting the row, and shipping back a
|
|
// command.run envelope that the agent's existing dispatcher will
|
|
// then execute. Fire-and-log: if the WS write fails we skip this
|
|
// tick — the next one will fire normally, and a flapping link is
|
|
// already noisy elsewhere.
|
|
func (s *Scheduler) fire(entry api.Schedule) {
|
|
s.mu.Lock()
|
|
tx := s.tx
|
|
s.mu.Unlock()
|
|
if tx == nil {
|
|
slog.Info("scheduler: tick fired with no active connection — skipping",
|
|
"schedule_id", entry.ID)
|
|
return
|
|
}
|
|
env, err := api.Marshal(api.MsgScheduleFire, "", api.ScheduleFirePayload{
|
|
ScheduleID: entry.ID,
|
|
ScheduledAt: time.Now().UTC(),
|
|
})
|
|
if err != nil {
|
|
slog.Error("scheduler: marshal schedule.fire",
|
|
"schedule_id", entry.ID, "err", err)
|
|
return
|
|
}
|
|
if err := tx.Send(env); err != nil {
|
|
slog.Warn("scheduler: send schedule.fire — skipping this tick",
|
|
"schedule_id", entry.ID, "err", err)
|
|
}
|
|
}
|