nixfleet_state_machine/
event.rs

//! Reducer inputs. Maps RFC-0005 §4.2 wire events onto reducer transitions.
//!
//! `Local*` variants are synthesized by the agent runtime from its own
//! workers (probe outputs, activation outcomes, sustained-failure timing).
//! `Remote*` variants are synthesized by the CP runtime from inbound POSTs
//! to `/v1/agent/events`. Both halves drive the same transitions: a
//! `LocalActivationCompleted` on the agent and a `RemoteActivationCompleted`
//! on the CP mirror both move `Activating → Soaking` with the same
//! invariants applied (RFC-0006 §2 principle 4).
10
11use chrono::{DateTime, Utc};
12use nixfleet_proto::OnHealthFailure;
13
14use crate::state::{ClosureHash, ProbeMode, ProbeName, ProbeStatus, ProbeSubResult};
15
16/// One entry in a `LocalProbeTopologyDeclared` / `RemoteProbeTopologyDeclared`
17/// event. Carries the per-probe metadata CP needs to evaluate the gate
18/// without reading the agent's filesystem (RFC-0007 §8). Threading
19/// `mode` per-event would also work, but the upfront declaration also
20/// lets the gate distinguish "this enforce probe declared but never
21/// reported" from "no enforce probe declared at all" — the difference
22/// matters for wave-hold semantics.
23#[derive(Debug, Clone, PartialEq, Eq)]
24pub struct ProbeTopologyEntry {
25    pub probe_name: ProbeName,
26    pub kind: String,
27    pub mode: ProbeMode,
28}
29
30/// Inputs to [`crate::step`]. Each variant maps to exactly one (legal)
31/// outbound or inbound RFC-0005 §4.2 event.
32#[derive(Debug, Clone)]
33pub enum Event {
34    // ─────────────────────────────────────────────────────────────────────
35    // Agent-local events (synthesized by agent workers, drive agent state)
36    // ─────────────────────────────────────────────────────────────────────
37    /// Agent has received a Dispatch via long-poll, verified
38    /// `target_closure` against the signed manifest, and is about to fire
39    /// the switch. Drives `Pending → Activating` and emits a `DispatchAck`
40    /// outbound event + a `LocalFireSwitch` effect.
41    ///
42    /// `target_closure` is the dispatch target validated by the longpoll
43    /// worker via `manifest_cache.ensure_for_dispatch` (which fetches
44    /// the per-rollout manifest and asserts the manifest's
45    /// `host_set[hostname].target_closure` matches the dispatched
46    /// value). Carrying it on the event eliminates the reducer's
47    /// bootstrap dependency on its own cached `SignedManifestSet`,
48    /// which is fed by a separate worker (`agent_manifest_poll`) on
49    /// a slower cadence and can be stale when a new rollout's
50    /// channel_ref has just been published. RFC-0004 §1 invariant 1:
51    /// the longpoll's just-verified value is the single source of
52    /// truth at bootstrap time.
53    ///
54    /// `soak_due_at` is the CP-resolved soak deadline carried verbatim
55    /// from the `/v1/agent/dispatch` response
56    /// (`DispatchResponse.soak_due_at`, computed by
57    /// `runtime::applier::open_rollout` from the manifest's
58    /// `rollout_policies[policy].waves[wave_index].soak_minutes`). CP
59    /// is the single source of truth for soak resolution per
60    /// RFC-0004 §1 invariant 1; the agent's bootstrap reads this
61    /// value into `state.soak_due_at` and the convergence-emission
62    /// pass-gate reads it back.
63    LocalActivate {
64        current_closure_at_dispatch: ClosureHash,
65        target_closure: ClosureHash,
66        received_at: DateTime<Utc>,
67        soak_due_at: DateTime<Utc>,
68        seq: u64,
69    },
70
71    /// Activation worker has begun executing `switch-to-configuration`.
72    /// Visibility-only; no state change.
73    LocalActivationStarted {
74        started_at: DateTime<Utc>,
75        switch_method: String,
76        seq: u64,
77    },
78
79    /// `switch-to-configuration` returned success. Drives
80    /// `Activating → Soaking`, sets `current_closure`, resets probe cache.
81    LocalActivationCompleted {
82        observed_current_closure: ClosureHash,
83        exit_code: i32,
84        completed_at: DateTime<Utc>,
85        seq: u64,
86    },
87
88    /// Activation pipeline set the profile + bootloader but skipped the
89    /// live `switch-to-configuration` because `component` (dbus, systemd,
90    /// kernel, init) cannot be safely swapped on a running system
91    /// (matches nixos-rebuild's own refusal). The target activates on
92    /// next reboot. The host is "soft-staged" — profile is correct, but
93    /// `/run/current-system` still points at the pre-switch closure.
94    ///
95    /// State stays Activating: the activation is not complete until the
96    /// host reboots and the new generation takes. When the operator
97    /// reboots and the agent restarts, the boot-recovery handshake
98    /// observes `current_closure == target_closure` and CP's
99    /// `handle_heartbeat` synthesizes `RemoteActivationCompleted`
100    /// (LIFT #1; recovery.rs scenario 3). The cascade resumes
101    /// automatically post-reboot.
102    ///
103    /// Visibility-only at the state-machine layer; emits an outbound
104    /// `ActivationDeferred` event so CP's event_log + operator queries
105    /// can surface the deferred-reboot condition (the host appears
106    /// stuck at Activating to the planner; this event tells operators
107    /// why).
108    LocalActivationDeferred {
109        component: String,
110        deferred_at: DateTime<Utc>,
111        seq: u64,
112    },
113
114    /// `switch-to-configuration` returned failure. Drives
115    /// `Activating → Failed`. The agent reads `onHealthFailure` from policy
116    /// and (if `rollback-and-halt`) immediately fires the rollback in the
117    /// same handler — the next event will be `LocalRollbackCompleted`.
118    LocalActivationFailed {
119        exit_code: i32,
120        stderr_tail: String,
121        failed_at: DateTime<Utc>,
122        seq: u64,
123    },
124
125    /// Agent's on-disk probe declarations (RFC-0007 §8). Emitted once
126    /// per `LocalActivationCompleted` so the CP can record the
127    /// authoritative declared-probe set without reading the agent's
128    /// filesystem. Visibility-only; no state change.
129    LocalProbeTopologyDeclared {
130        probes: Vec<ProbeTopologyEntry>,
131        declared_at: DateTime<Utc>,
132        seq: u64,
133    },
134
135    /// First probe run since activation observed a result (any status).
136    /// Stamps `probe_observed_first_at`; soak gate may now consult probe
137    /// state. Visibility + gate-enable; no state change.
138    LocalProbeObservedFirst {
139        probe_name: ProbeName,
140        mode: ProbeMode,
141        observed_at: DateTime<Utc>,
142        seq: u64,
143    },
144
145    /// A probe ran and produced a result. Updates the probe map. Visibility
146    /// + state update on `probes` field; no `HostState` change.
147    ///
148    /// `sub_results` carries per-control accounting for evidence/custom-
149    /// framework probes (one entry per (control, framework, article) tuple
150    /// with `effective_mode` + `override_reason`). The reducer does not
151    /// consult these — gate decisions key off the aggregate `status` — but
152    /// it threads them onto the OutboundAgentEvent so the signed wire
153    /// payload + CP event_log preserve the audit trail an auditor needs
154    /// to answer "why was control X downgraded?" `None` for non-evidence
155    /// probe kinds.
156    LocalProbeResult {
157        probe_name: ProbeName,
158        mode: ProbeMode,
159        status: ProbeStatus,
160        observed_at: DateTime<Utc>,
161        failure_reason: Option<String>,
162        sub_results: Option<Vec<ProbeSubResult>>,
163        seq: u64,
164    },
165
166    /// First `Pass → Fail` (or first-ever `Fail`) for any declared probe.
167    /// Stamps `probe_failure_first_at`; sweep window now ticks from this
168    /// exact agent-supplied time.
169    LocalProbeFailureFirst {
170        probe_name: ProbeName,
171        mode: ProbeMode,
172        first_failed_at: DateTime<Utc>,
173        seq: u64,
174    },
175
176    /// Agent's local sweep timer crossed `HEALTH_FAILURE_THRESHOLD_SECS`.
177    /// Drives `Soaking → Failed`. Agent records which policy branch it's
178    /// applying (read from the signed manifest); applier follows.
179    LocalSustainedFailureCrossed {
180        failed_at: DateTime<Utc>,
181        sustained_duration_secs: u64,
182        failing_probes: Vec<ProbeName>,
183        policy_applied: OnHealthFailure,
184        seq: u64,
185    },
186
187    /// Agent has finished rollback. Drives `Failed → Reverted` and
188    /// populates `reverted_to`, `current_closure`.
189    LocalRollbackCompleted {
190        reverted_to_closure: ClosureHash,
191        exit_code: i32,
192        completed_at: DateTime<Utc>,
193        seq: u64,
194    },
195
196    /// Agent has re-verified the three Converged invariants
197    /// (`soak_due_at` elapsed, all enforce-mode probes Pass, `current ==
198    /// target`) and is declaring success. Observe and Disabled probes do
199    /// not gate per RFC-0007 §3.3 (ProbeMode docstring, state.rs). Drives
200    /// `Soaking → Converged`.
201    LocalConvergedReached {
202        converged_at: DateTime<Utc>,
203        current_closure: ClosureHash,
204        seq: u64,
205    },
206
207    // ─────────────────────────────────────────────────────────────────────
208    // CP-mirror events (synthesized from inbound agent POSTs)
209    // ─────────────────────────────────────────────────────────────────────
210    /// Mirrors `LocalActivate`. CP receives `DispatchAck` at
211    /// `/v1/agent/events`; drives `Pending → Activating` in the mirror.
212    RemoteDispatchAck {
213        current_closure_at_dispatch: ClosureHash,
214        received_at: DateTime<Utc>,
215        seq: u64,
216    },
217
218    RemoteActivationStarted {
219        started_at: DateTime<Utc>,
220        switch_method: String,
221        seq: u64,
222    },
223
224    /// CP-side mirror of `LocalActivationDeferred`. CP receives
225    /// `ActivationDeferred` at `/v1/agent/events`; no state change in
226    /// the mirror — the host stays Activating until the operator
227    /// reboots and LIFT #1's heartbeat synthesis advances state.
228    /// Emits a `RemoteAppendEventLog` effect so event_log captures the
229    /// deferral for operator queries + replay re-derivability.
230    RemoteActivationDeferred {
231        component: String,
232        deferred_at: DateTime<Utc>,
233        seq: u64,
234    },
235
236    RemoteActivationCompleted {
237        observed_current_closure: ClosureHash,
238        exit_code: i32,
239        completed_at: DateTime<Utc>,
240        seq: u64,
241    },
242
243    RemoteActivationFailed {
244        exit_code: i32,
245        stderr_tail: String,
246        failed_at: DateTime<Utc>,
247        seq: u64,
248    },
249
250    RemoteProbeTopologyDeclared {
251        probes: Vec<ProbeTopologyEntry>,
252        declared_at: DateTime<Utc>,
253        seq: u64,
254    },
255
256    RemoteProbeObservedFirst {
257        probe_name: ProbeName,
258        mode: ProbeMode,
259        observed_at: DateTime<Utc>,
260        seq: u64,
261    },
262
263    RemoteProbeResult {
264        probe_name: ProbeName,
265        mode: ProbeMode,
266        status: ProbeStatus,
267        observed_at: DateTime<Utc>,
268        failure_reason: Option<String>,
269        sub_results: Option<Vec<ProbeSubResult>>,
270        seq: u64,
271    },
272
273    RemoteProbeFailureFirst {
274        probe_name: ProbeName,
275        mode: ProbeMode,
276        first_failed_at: DateTime<Utc>,
277        seq: u64,
278    },
279
280    RemoteFailed {
281        failed_at: DateTime<Utc>,
282        sustained_duration_secs: u64,
283        failing_probes: Vec<ProbeName>,
284        policy_applied: OnHealthFailure,
285        seq: u64,
286    },
287
288    RemoteRollbackComplete {
289        reverted_to_closure: ClosureHash,
290        exit_code: i32,
291        completed_at: DateTime<Utc>,
292        seq: u64,
293    },
294
295    RemoteConverged {
296        converged_at: DateTime<Utc>,
297        current_closure: ClosureHash,
298        seq: u64,
299    },
300}
301
302impl Event {
303    /// The monotonic per-(host, rollout) sequence number every event
304    /// carries (RFC-0005 §4). Used by the runtime layer for deduplication
305    /// and out-of-order detection; the reducer itself just records it on
306    /// the state to support `Replay-From` (RFC-0005 §4.3).
307    pub fn seq(&self) -> u64 {
308        match self {
309            Event::LocalActivate { seq, .. }
310            | Event::LocalActivationStarted { seq, .. }
311            | Event::LocalActivationCompleted { seq, .. }
312            | Event::LocalActivationDeferred { seq, .. }
313            | Event::LocalActivationFailed { seq, .. }
314            | Event::LocalProbeTopologyDeclared { seq, .. }
315            | Event::LocalProbeObservedFirst { seq, .. }
316            | Event::LocalProbeResult { seq, .. }
317            | Event::LocalProbeFailureFirst { seq, .. }
318            | Event::LocalSustainedFailureCrossed { seq, .. }
319            | Event::LocalRollbackCompleted { seq, .. }
320            | Event::LocalConvergedReached { seq, .. }
321            | Event::RemoteDispatchAck { seq, .. }
322            | Event::RemoteActivationStarted { seq, .. }
323            | Event::RemoteActivationCompleted { seq, .. }
324            | Event::RemoteActivationDeferred { seq, .. }
325            | Event::RemoteActivationFailed { seq, .. }
326            | Event::RemoteProbeTopologyDeclared { seq, .. }
327            | Event::RemoteProbeObservedFirst { seq, .. }
328            | Event::RemoteProbeResult { seq, .. }
329            | Event::RemoteProbeFailureFirst { seq, .. }
330            | Event::RemoteFailed { seq, .. }
331            | Event::RemoteRollbackComplete { seq, .. }
332            | Event::RemoteConverged { seq, .. } => *seq,
333        }
334    }
335}