nixfleet_state_machine/state.rs
1//! Per-(rollout, host) reducer state. Maps RFC-0005 §5 `HostRolloutRecord`.
2
3use std::collections::HashMap;
4
5use chrono::{DateTime, Utc};
6use nixfleet_proto::OnHealthFailure;
7use serde::{Deserialize, Serialize};
8
9// `RolloutId` lives in `nixfleet-proto` (RFC-0008 §6.3): a newtype
10// around `"{channel}@{channel_ref}"` with constructor discipline
11// analogous to `Verified<T>`. Re-exported from here so downstream
12// crates that already `use nixfleet_state_machine::RolloutId` work
13// without a renaming churn pass.
14pub use nixfleet_proto::RolloutId;
15
/// Closure identifier, carried as an owned `String`. `HostRolloutState`
/// uses it for `target_closure`, `current_closure_at_dispatch`,
/// `current_closure`, and `reverted_to`.
pub type ClosureHash = String;

/// Probe name; the key type of the `HostRolloutState::probes` map.
pub type ProbeName = String;
18
19/// 6-state rollout machine per RFC-0005 §3. Replaces the pre-v0.2 9-variant
20/// `nixfleet_proto::HostRolloutState` (the `Queued` / `Dispatched` /
21/// `ConfirmWindow` / `Healthy` / `Soaked` variants are removed — phases 4-6
22/// delete the old enum entirely once the new machine is wired through).
23#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
24pub enum HostState {
25 /// CP has issued a Dispatch; agent has not yet acked.
26 Pending,
27 /// Agent acked; switch-to-configuration is firing or has fired pending
28 /// confirmation.
29 Activating,
30 /// Activation pipeline set the profile + bootloader but skipped the
31 /// live `switch-to-configuration` because a critical component
32 /// (dbus/systemd/kernel/init) cannot be live-swapped on a running
33 /// system. The new generation activates on next reboot. **Ordering-
34 /// eligible** (host-edges + wave-promotion + advance_current_waves
35 /// treat Deferred ≡ Converged for cascade-progression purposes —
36 /// the host has done what it can within the rollout step) but
37 /// **not health-verified** (probes haven't run against the new
38 /// closure; channel-edges stays strict and waits for actual
39 /// Converged). On operator reboot, the agent's boot-recovery
40 /// handshake observes `current_closure == target_closure` and
41 /// CP's `handle_heartbeat` synthesis (LIFT #1) drives
42 /// `Deferred → Soaking` via `RemoteActivationCompleted`.
43 Deferred,
44 /// Agent reports activation succeeded; probes have started; soak window
45 /// has not yet elapsed.
46 Soaking,
47 /// Soak elapsed, probes passing, `current == declared`. **Terminal for
48 /// ordering** (successor channels may release).
49 Converged,
50 /// Sustained probe failure observed by the agent and reported to CP.
51 /// Agent has read `onHealthFailure` from the signed manifest and decided
52 /// autonomously what comes next (RFC-0005 §4.2 `Failed` event).
53 Failed,
54 /// Agent has completed rollback to prior closure. Channel-level
55 /// quarantine holds the bad SHA.
56 Reverted,
57}
58
59#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
60#[serde(rename_all = "kebab-case")]
61pub enum ProbeStatus {
62 Pass,
63 Fail,
64}
65
66/// Per-probe gate participation (RFC-0007 §3.4). Threaded through every
67/// probe event so CP can decide whether to gate on a result without
68/// consulting a separate topology table.
69///
70/// - `Enforce` — wave gate consults latest result; Fail blocks promotion.
71/// - `Observe` — result recorded in event_log; gate ignores it.
72/// - `Disabled` — declared but agent does not run it (operator suppression).
73#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
74#[serde(rename_all = "kebab-case")]
75pub enum ProbeMode {
76 Enforce,
77 Observe,
78 Disabled,
79}
80
81impl Default for ProbeMode {
82 /// Conservative default for persisted state that pre-dates the
83 /// per-probe `mode` field on `ProbeRecord`: assume `Enforce`. Old
84 /// state thus retains its gating semantics on rehydration; the next
85 /// probe event from the agent updates the record to the actually-
86 /// declared mode per RFC-0007 §3.4.
87 fn default() -> Self {
88 ProbeMode::Enforce
89 }
90}
91
92/// Per-control sub-result on a `kind = evidence` probe (RFC-0007 §7.1).
93/// `None` aggregate for non-evidence probes; `Some(vec)` for evidence
94/// probes — the applier's `probe_failures` co-write iterates `sub_results`
95/// to populate one row per failing control_id.
96#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
97pub struct ProbeSubResult {
98 pub control_id: String,
99 pub status: ProbeStatus,
100 /// Framework label (e.g. "nis2-essential"). Echoes the probe
101 /// declaration's `framework` field.
102 pub framework: String,
103 /// Framework-specific control reference (e.g. "nis2:21(b)").
104 pub article: Option<String>,
105 /// Per-control effective mode after applying the probe's
106 /// `controlOverrides` / `controls` map. The compliance_wave gate
107 /// counts only `Enforce`-mode sub_results; `Observe` is recorded
108 /// for visibility but does not gate. `Disabled` controls are
109 /// dropped from sub_results entirely at the agent runner.
110 /// `serde(default)` falls back to the probe-level mode for events
111 /// emitted by pre-override-aware agents.
112 #[serde(default)]
113 pub effective_mode: ProbeMode,
114 /// Operator-declared audit rationale for the override, sourced
115 /// from `controlOverrides[control].reason` or
116 /// `controls[control].reason` in fleet.nix. `None` when no
117 /// override applies (effective_mode equals probe-level mode);
118 /// `Some("")` when the operator declared an override but left
119 /// the rationale blank. CP writes this verbatim into event_log
120 /// payloads so auditors recover "why was this control
121 /// downgraded" from the signed event stream alone.
122 #[serde(default, skip_serializing_if = "Option::is_none")]
123 pub override_reason: Option<String>,
124}
125
126#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
127pub struct ProbeRecord {
128 pub status: ProbeStatus,
129 /// Per-probe gate participation per RFC-0007 §3.4. Only
130 /// `Enforce`-mode probes contribute to the soak-gate `failing_probes`
131 /// builder; `Observe` and `Disabled` record events but do not gate.
132 /// `#[serde(default)]` (-> `Enforce`) keeps rehydration safe for
133 /// state persisted before this field existed.
134 #[serde(default)]
135 pub mode: ProbeMode,
136 pub last_observed_at: DateTime<Utc>,
137 pub last_pass_at: Option<DateTime<Utc>>,
138 pub failure_reason: Option<String>,
139}
140
141/// Per-(rollout, host) reducer state. Mirrors RFC-0005 §5
142/// `HostRolloutRecord`; the persistence schema in Phase 4 serializes this
143/// struct.
144///
145/// Every transition timestamp is agent-supplied (received via the wire and
146/// stamped onto the corresponding field by the reducer). `dispatched_at` is
147/// the lone exception — it's CP-issued, so CP wallclock is the source of
148/// truth there (RFC-0005 §5).
149#[derive(Debug, Clone)]
150pub struct HostRolloutState {
151 pub rollout_id: RolloutId,
152 pub hostname: String,
153 pub channel: String,
154 pub state: HostState,
155
156 // Closures
157 pub target_closure: ClosureHash,
158 pub current_closure_at_dispatch: Option<ClosureHash>,
159 pub current_closure: Option<ClosureHash>,
160 pub reverted_to: Option<ClosureHash>,
161
162 // Transition timestamps (agent-supplied unless noted)
163 pub dispatched_at: DateTime<Utc>,
164 pub dispatch_acked_at: Option<DateTime<Utc>>,
165 pub activation_started_at: Option<DateTime<Utc>>,
166 pub activation_completed_at: Option<DateTime<Utc>>,
167 pub activation_failed_at: Option<DateTime<Utc>>,
168 pub probe_observed_first_at: Option<DateTime<Utc>>,
169 pub probe_failure_first_at: Option<DateTime<Utc>>,
170 pub soak_due_at: Option<DateTime<Utc>>,
171 pub converged_at: Option<DateTime<Utc>>,
172 pub failed_at: Option<DateTime<Utc>>,
173 pub policy_applied: Option<OnHealthFailure>,
174 pub reverted_at: Option<DateTime<Utc>>,
175
176 // Live probe state, by probe name
177 pub probes: HashMap<ProbeName, ProbeRecord>,
178
179 /// Monotonic per (hostname, rollout_id). Gaps signal lost events;
180 /// out-of-order events are dropped with a warning at the runtime layer.
181 pub last_event_seq: u64,
182}
183
184impl HostRolloutState {
185 /// Construct the initial `Pending` state when CP queues a Dispatch (or
186 /// the agent receives one via long-poll on `/v1/agent/dispatch`).
187 ///
188 /// This is the only legitimate way to bring a `(rollout_id, hostname)`
189 /// record into existence. Subsequent transitions go through [`step`].
190 pub fn new_pending(
191 rollout_id: RolloutId,
192 hostname: String,
193 channel: String,
194 target_closure: ClosureHash,
195 dispatched_at: DateTime<Utc>,
196 soak_due_at: DateTime<Utc>,
197 ) -> Self {
198 Self {
199 rollout_id,
200 hostname,
201 channel,
202 state: HostState::Pending,
203 target_closure,
204 current_closure_at_dispatch: None,
205 current_closure: None,
206 reverted_to: None,
207 dispatched_at,
208 dispatch_acked_at: None,
209 activation_started_at: None,
210 activation_completed_at: None,
211 activation_failed_at: None,
212 probe_observed_first_at: None,
213 probe_failure_first_at: None,
214 soak_due_at: Some(soak_due_at),
215 converged_at: None,
216 failed_at: None,
217 policy_applied: None,
218 reverted_at: None,
219 probes: HashMap::new(),
220 last_event_seq: 0,
221 }
222 }
223}