nixfleet_state_machine/event.rs
//! Reducer inputs. Maps RFC-0005 §4.2 wire events onto reducer transitions.
//!
//! `Local*` variants are synthesized by the agent runtime from its own
//! workers (probe outputs, activation outcomes, sustained-failure timing).
//! `Remote*` variants are synthesized by the CP runtime from inbound POSTs
//! to `/v1/agent/events`. Both halves drive the same transitions: a
//! `LocalActivationCompleted` on the agent and a `RemoteActivationCompleted`
//! on the CP mirror both move `Activating → Soaking` with the same
//! invariants applied (RFC-0006 §2 principle 4).
10
11use chrono::{DateTime, Utc};
12use nixfleet_proto::OnHealthFailure;
13
14use crate::state::{ClosureHash, ProbeMode, ProbeName, ProbeStatus, ProbeSubResult};
15
16/// One entry in a `LocalProbeTopologyDeclared` / `RemoteProbeTopologyDeclared`
17/// event. Carries the per-probe metadata CP needs to evaluate the gate
18/// without reading the agent's filesystem (RFC-0007 §8). Threading
19/// `mode` per-event would also work, but the upfront declaration also
20/// lets the gate distinguish "this enforce probe declared but never
21/// reported" from "no enforce probe declared at all" — the difference
22/// matters for wave-hold semantics.
23#[derive(Debug, Clone, PartialEq, Eq)]
24pub struct ProbeTopologyEntry {
25 pub probe_name: ProbeName,
26 pub kind: String,
27 pub mode: ProbeMode,
28}
29
30/// Inputs to [`crate::step`]. Each variant maps to exactly one (legal)
31/// outbound or inbound RFC-0005 §4.2 event.
32#[derive(Debug, Clone)]
33pub enum Event {
34 // ─────────────────────────────────────────────────────────────────────
35 // Agent-local events (synthesized by agent workers, drive agent state)
36 // ─────────────────────────────────────────────────────────────────────
37 /// Agent has received a Dispatch via long-poll, verified
38 /// `target_closure` against the signed manifest, and is about to fire
39 /// the switch. Drives `Pending → Activating` and emits a `DispatchAck`
40 /// outbound event + a `LocalFireSwitch` effect.
41 ///
42 /// `target_closure` is the dispatch target validated by the longpoll
43 /// worker via `manifest_cache.ensure_for_dispatch` (which fetches
44 /// the per-rollout manifest and asserts the manifest's
45 /// `host_set[hostname].target_closure` matches the dispatched
46 /// value). Carrying it on the event eliminates the reducer's
47 /// bootstrap dependency on its own cached `SignedManifestSet`,
48 /// which is fed by a separate worker (`agent_manifest_poll`) on
49 /// a slower cadence and can be stale when a new rollout's
50 /// channel_ref has just been published. RFC-0004 §1 invariant 1:
51 /// the longpoll's just-verified value is the single source of
52 /// truth at bootstrap time.
53 ///
54 /// `soak_due_at` is the CP-resolved soak deadline carried verbatim
55 /// from the `/v1/agent/dispatch` response
56 /// (`DispatchResponse.soak_due_at`, computed by
57 /// `runtime::applier::open_rollout` from the manifest's
58 /// `rollout_policies[policy].waves[wave_index].soak_minutes`). CP
59 /// is the single source of truth for soak resolution per
60 /// RFC-0004 §1 invariant 1; the agent's bootstrap reads this
61 /// value into `state.soak_due_at` and the convergence-emission
62 /// pass-gate reads it back.
63 LocalActivate {
64 current_closure_at_dispatch: ClosureHash,
65 target_closure: ClosureHash,
66 received_at: DateTime<Utc>,
67 soak_due_at: DateTime<Utc>,
68 seq: u64,
69 },
70
71 /// Activation worker has begun executing `switch-to-configuration`.
72 /// Visibility-only; no state change.
73 LocalActivationStarted {
74 started_at: DateTime<Utc>,
75 switch_method: String,
76 seq: u64,
77 },
78
79 /// `switch-to-configuration` returned success. Drives
80 /// `Activating → Soaking`, sets `current_closure`, resets probe cache.
81 LocalActivationCompleted {
82 observed_current_closure: ClosureHash,
83 exit_code: i32,
84 completed_at: DateTime<Utc>,
85 seq: u64,
86 },
87
88 /// Activation pipeline set the profile + bootloader but skipped the
89 /// live `switch-to-configuration` because `component` (dbus, systemd,
90 /// kernel, init) cannot be safely swapped on a running system
91 /// (matches nixos-rebuild's own refusal). The target activates on
92 /// next reboot. The host is "soft-staged" — profile is correct, but
93 /// `/run/current-system` still points at the pre-switch closure.
94 ///
95 /// State stays Activating: the activation is not complete until the
96 /// host reboots and the new generation takes. When the operator
97 /// reboots and the agent restarts, the boot-recovery handshake
98 /// observes `current_closure == target_closure` and CP's
99 /// `handle_heartbeat` synthesizes `RemoteActivationCompleted`
100 /// (LIFT #1; recovery.rs scenario 3). The cascade resumes
101 /// automatically post-reboot.
102 ///
103 /// Visibility-only at the state-machine layer; emits an outbound
104 /// `ActivationDeferred` event so CP's event_log + operator queries
105 /// can surface the deferred-reboot condition (the host appears
106 /// stuck at Activating to the planner; this event tells operators
107 /// why).
108 LocalActivationDeferred {
109 component: String,
110 deferred_at: DateTime<Utc>,
111 seq: u64,
112 },
113
114 /// `switch-to-configuration` returned failure. Drives
115 /// `Activating → Failed`. The agent reads `onHealthFailure` from policy
116 /// and (if `rollback-and-halt`) immediately fires the rollback in the
117 /// same handler — the next event will be `LocalRollbackCompleted`.
118 LocalActivationFailed {
119 exit_code: i32,
120 stderr_tail: String,
121 failed_at: DateTime<Utc>,
122 seq: u64,
123 },
124
125 /// Agent's on-disk probe declarations (RFC-0007 §8). Emitted once
126 /// per `LocalActivationCompleted` so the CP can record the
127 /// authoritative declared-probe set without reading the agent's
128 /// filesystem. Visibility-only; no state change.
129 LocalProbeTopologyDeclared {
130 probes: Vec<ProbeTopologyEntry>,
131 declared_at: DateTime<Utc>,
132 seq: u64,
133 },
134
135 /// First probe run since activation observed a result (any status).
136 /// Stamps `probe_observed_first_at`; soak gate may now consult probe
137 /// state. Visibility + gate-enable; no state change.
138 LocalProbeObservedFirst {
139 probe_name: ProbeName,
140 mode: ProbeMode,
141 observed_at: DateTime<Utc>,
142 seq: u64,
143 },
144
145 /// A probe ran and produced a result. Updates the probe map. Visibility
146 /// + state update on `probes` field; no `HostState` change.
147 ///
148 /// `sub_results` carries per-control accounting for evidence/custom-
149 /// framework probes (one entry per (control, framework, article) tuple
150 /// with `effective_mode` + `override_reason`). The reducer does not
151 /// consult these — gate decisions key off the aggregate `status` — but
152 /// it threads them onto the OutboundAgentEvent so the signed wire
153 /// payload + CP event_log preserve the audit trail an auditor needs
154 /// to answer "why was control X downgraded?" `None` for non-evidence
155 /// probe kinds.
156 LocalProbeResult {
157 probe_name: ProbeName,
158 mode: ProbeMode,
159 status: ProbeStatus,
160 observed_at: DateTime<Utc>,
161 failure_reason: Option<String>,
162 sub_results: Option<Vec<ProbeSubResult>>,
163 seq: u64,
164 },
165
166 /// First `Pass → Fail` (or first-ever `Fail`) for any declared probe.
167 /// Stamps `probe_failure_first_at`; sweep window now ticks from this
168 /// exact agent-supplied time.
169 LocalProbeFailureFirst {
170 probe_name: ProbeName,
171 mode: ProbeMode,
172 first_failed_at: DateTime<Utc>,
173 seq: u64,
174 },
175
176 /// Agent's local sweep timer crossed `HEALTH_FAILURE_THRESHOLD_SECS`.
177 /// Drives `Soaking → Failed`. Agent records which policy branch it's
178 /// applying (read from the signed manifest); applier follows.
179 LocalSustainedFailureCrossed {
180 failed_at: DateTime<Utc>,
181 sustained_duration_secs: u64,
182 failing_probes: Vec<ProbeName>,
183 policy_applied: OnHealthFailure,
184 seq: u64,
185 },
186
187 /// Agent has finished rollback. Drives `Failed → Reverted` and
188 /// populates `reverted_to`, `current_closure`.
189 LocalRollbackCompleted {
190 reverted_to_closure: ClosureHash,
191 exit_code: i32,
192 completed_at: DateTime<Utc>,
193 seq: u64,
194 },
195
196 /// Agent has re-verified the three Converged invariants
197 /// (`soak_due_at` elapsed, all enforce-mode probes Pass, `current ==
198 /// target`) and is declaring success. Observe and Disabled probes do
199 /// not gate per RFC-0007 §3.3 (ProbeMode docstring, state.rs). Drives
200 /// `Soaking → Converged`.
201 LocalConvergedReached {
202 converged_at: DateTime<Utc>,
203 current_closure: ClosureHash,
204 seq: u64,
205 },
206
207 // ─────────────────────────────────────────────────────────────────────
208 // CP-mirror events (synthesized from inbound agent POSTs)
209 // ─────────────────────────────────────────────────────────────────────
210 /// Mirrors `LocalActivate`. CP receives `DispatchAck` at
211 /// `/v1/agent/events`; drives `Pending → Activating` in the mirror.
212 RemoteDispatchAck {
213 current_closure_at_dispatch: ClosureHash,
214 received_at: DateTime<Utc>,
215 seq: u64,
216 },
217
218 RemoteActivationStarted {
219 started_at: DateTime<Utc>,
220 switch_method: String,
221 seq: u64,
222 },
223
224 /// CP-side mirror of `LocalActivationDeferred`. CP receives
225 /// `ActivationDeferred` at `/v1/agent/events`; no state change in
226 /// the mirror — the host stays Activating until the operator
227 /// reboots and LIFT #1's heartbeat synthesis advances state.
228 /// Emits a `RemoteAppendEventLog` effect so event_log captures the
229 /// deferral for operator queries + replay re-derivability.
230 RemoteActivationDeferred {
231 component: String,
232 deferred_at: DateTime<Utc>,
233 seq: u64,
234 },
235
236 RemoteActivationCompleted {
237 observed_current_closure: ClosureHash,
238 exit_code: i32,
239 completed_at: DateTime<Utc>,
240 seq: u64,
241 },
242
243 RemoteActivationFailed {
244 exit_code: i32,
245 stderr_tail: String,
246 failed_at: DateTime<Utc>,
247 seq: u64,
248 },
249
250 RemoteProbeTopologyDeclared {
251 probes: Vec<ProbeTopologyEntry>,
252 declared_at: DateTime<Utc>,
253 seq: u64,
254 },
255
256 RemoteProbeObservedFirst {
257 probe_name: ProbeName,
258 mode: ProbeMode,
259 observed_at: DateTime<Utc>,
260 seq: u64,
261 },
262
263 RemoteProbeResult {
264 probe_name: ProbeName,
265 mode: ProbeMode,
266 status: ProbeStatus,
267 observed_at: DateTime<Utc>,
268 failure_reason: Option<String>,
269 sub_results: Option<Vec<ProbeSubResult>>,
270 seq: u64,
271 },
272
273 RemoteProbeFailureFirst {
274 probe_name: ProbeName,
275 mode: ProbeMode,
276 first_failed_at: DateTime<Utc>,
277 seq: u64,
278 },
279
280 RemoteFailed {
281 failed_at: DateTime<Utc>,
282 sustained_duration_secs: u64,
283 failing_probes: Vec<ProbeName>,
284 policy_applied: OnHealthFailure,
285 seq: u64,
286 },
287
288 RemoteRollbackComplete {
289 reverted_to_closure: ClosureHash,
290 exit_code: i32,
291 completed_at: DateTime<Utc>,
292 seq: u64,
293 },
294
295 RemoteConverged {
296 converged_at: DateTime<Utc>,
297 current_closure: ClosureHash,
298 seq: u64,
299 },
300}
301
302impl Event {
303 /// The monotonic per-(host, rollout) sequence number every event
304 /// carries (RFC-0005 §4). Used by the runtime layer for deduplication
305 /// and out-of-order detection; the reducer itself just records it on
306 /// the state to support `Replay-From` (RFC-0005 §4.3).
307 pub fn seq(&self) -> u64 {
308 match self {
309 Event::LocalActivate { seq, .. }
310 | Event::LocalActivationStarted { seq, .. }
311 | Event::LocalActivationCompleted { seq, .. }
312 | Event::LocalActivationDeferred { seq, .. }
313 | Event::LocalActivationFailed { seq, .. }
314 | Event::LocalProbeTopologyDeclared { seq, .. }
315 | Event::LocalProbeObservedFirst { seq, .. }
316 | Event::LocalProbeResult { seq, .. }
317 | Event::LocalProbeFailureFirst { seq, .. }
318 | Event::LocalSustainedFailureCrossed { seq, .. }
319 | Event::LocalRollbackCompleted { seq, .. }
320 | Event::LocalConvergedReached { seq, .. }
321 | Event::RemoteDispatchAck { seq, .. }
322 | Event::RemoteActivationStarted { seq, .. }
323 | Event::RemoteActivationCompleted { seq, .. }
324 | Event::RemoteActivationDeferred { seq, .. }
325 | Event::RemoteActivationFailed { seq, .. }
326 | Event::RemoteProbeTopologyDeclared { seq, .. }
327 | Event::RemoteProbeObservedFirst { seq, .. }
328 | Event::RemoteProbeResult { seq, .. }
329 | Event::RemoteProbeFailureFirst { seq, .. }
330 | Event::RemoteFailed { seq, .. }
331 | Event::RemoteRollbackComplete { seq, .. }
332 | Event::RemoteConverged { seq, .. } => *seq,
333 }
334 }
335}