nixfleet_state_machine/
state.rs

1//! Per-(rollout, host) reducer state. Maps RFC-0005 §5 `HostRolloutRecord`.
2
3use std::collections::HashMap;
4
5use chrono::{DateTime, Utc};
6use nixfleet_proto::OnHealthFailure;
7use serde::{Deserialize, Serialize};
8
9// `RolloutId` lives in `nixfleet-proto` (RFC-0008 §6.3): a newtype
10// around `"{channel}@{channel_ref}"` with constructor discipline
11// analogous to `Verified<T>`. Re-exported from here so downstream
12// crates that already `use nixfleet_state_machine::RolloutId` work
13// without a renaming churn pass.
14pub use nixfleet_proto::RolloutId;
15
/// Closure identifier carried as a plain `String`; the type of the
/// `target_closure`, `current_closure_at_dispatch`, `current_closure` and
/// `reverted_to` fields of [`HostRolloutState`]. The alias performs no
/// validation of its own.
pub type ClosureHash = String;
/// Probe identifier carried as a plain `String`; the key type of
/// [`HostRolloutState::probes`].
pub type ProbeName = String;
18
/// Per-host rollout state machine per RFC-0005 §3. Seven variants are
/// declared here: `Pending`, `Activating`, `Deferred`, `Soaking`,
/// `Converged`, `Failed`, `Reverted`. Replaces the pre-v0.2 9-variant
/// `nixfleet_proto::HostRolloutState` (the `Queued` / `Dispatched` /
/// `ConfirmWindow` / `Healthy` / `Soaked` variants are removed — phases 4-6
/// delete the old enum entirely once the new machine is wired through).
//
// NOTE(review): this header used to say "6-state", but the enum has seven
// variants; presumably `Deferred` was added on top of the six RFC states —
// confirm against RFC-0005 §3.
// NOTE(review): unlike `ProbeStatus` / `ProbeMode` there is no
// `#[serde(rename_all = ...)]` here, so the variants serialize under their
// Rust names (`Pending`, `Activating`, ...) — confirm that spelling is the
// intended one before anyone changes it.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum HostState {
    /// CP has issued a Dispatch; agent has not yet acked.
    Pending,
    /// Agent acked; switch-to-configuration is firing or has fired pending
    /// confirmation.
    Activating,
    /// Activation pipeline set the profile + bootloader but skipped the
    /// live `switch-to-configuration` because a critical component
    /// (dbus/systemd/kernel/init) cannot be live-swapped on a running
    /// system. The new generation activates on next reboot. **Ordering-
    /// eligible** (host-edges + wave-promotion + advance_current_waves
    /// treat Deferred ≡ Converged for cascade-progression purposes —
    /// the host has done what it can within the rollout step) but
    /// **not health-verified** (probes haven't run against the new
    /// closure; channel-edges stays strict and waits for actual
    /// Converged). On operator reboot, the agent's boot-recovery
    /// handshake observes `current_closure == target_closure` and
    /// CP's `handle_heartbeat` synthesis (LIFT #1) drives
    /// `Deferred → Soaking` via `RemoteActivationCompleted`.
    Deferred,
    /// Agent reports activation succeeded; probes have started; soak window
    /// has not yet elapsed.
    Soaking,
    /// Soak elapsed, probes passing, `current == declared`. **Terminal for
    /// ordering** (successor channels may release).
    Converged,
    /// Sustained probe failure observed by the agent and reported to CP.
    /// Agent has read `onHealthFailure` from the signed manifest and decided
    /// autonomously what comes next (RFC-0005 §4.2 `Failed` event).
    Failed,
    /// Agent has completed rollback to prior closure. Channel-level
    /// quarantine holds the bad SHA.
    Reverted,
}
58
/// Outcome carried by a probe result: the type of `ProbeRecord::status`
/// and `ProbeSubResult::status`. `rename_all = "kebab-case"` makes the
/// serialized variant names `pass` and `fail`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "kebab-case")]
pub enum ProbeStatus {
    /// The probe (or control) passed.
    Pass,
    /// The probe (or control) failed.
    Fail,
}
65
66/// Per-probe gate participation (RFC-0007 §3.4). Threaded through every
67/// probe event so CP can decide whether to gate on a result without
68/// consulting a separate topology table.
69///
70/// - `Enforce`  — wave gate consults latest result; Fail blocks promotion.
71/// - `Observe`  — result recorded in event_log; gate ignores it.
72/// - `Disabled` — declared but agent does not run it (operator suppression).
73#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
74#[serde(rename_all = "kebab-case")]
75pub enum ProbeMode {
76    Enforce,
77    Observe,
78    Disabled,
79}
80
81impl Default for ProbeMode {
82    /// Conservative default for persisted state that pre-dates the
83    /// per-probe `mode` field on `ProbeRecord`: assume `Enforce`. Old
84    /// state thus retains its gating semantics on rehydration; the next
85    /// probe event from the agent updates the record to the actually-
86    /// declared mode per RFC-0007 §3.4.
87    fn default() -> Self {
88        ProbeMode::Enforce
89    }
90}
91
/// Per-control sub-result on a `kind = evidence` probe (RFC-0007 §7.1).
/// `None` aggregate for non-evidence probes; `Some(vec)` for evidence
/// probes — the applier's `probe_failures` co-write iterates `sub_results`
/// to populate one row per failing control_id.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct ProbeSubResult {
    /// Identifier of the control this sub-result reports on.
    pub control_id: String,
    /// Pass/fail outcome for this control.
    pub status: ProbeStatus,
    /// Framework label (e.g. "nis2-essential"). Echoes the probe
    /// declaration's `framework` field.
    pub framework: String,
    /// Framework-specific control reference (e.g. "nis2:21(b)").
    pub article: Option<String>,
    /// Per-control effective mode after applying the probe's
    /// `controlOverrides` / `controls` map. The compliance_wave gate
    /// counts only `Enforce`-mode sub_results; `Observe` is recorded
    /// for visibility but does not gate. `Disabled` controls are
    /// dropped from sub_results entirely at the agent runner.
    /// `serde(default)` deserializes a missing field as
    /// `ProbeMode::default()`, i.e. `Enforce` — not the enclosing
    /// probe's mode, which serde cannot see from this struct.
    // NOTE(review): the earlier wording said the fallback is "the
    // probe-level mode" for events emitted by pre-override-aware agents.
    // If that is the intended behavior, something outside this struct has
    // to apply it after deserialization — confirm.
    #[serde(default)]
    pub effective_mode: ProbeMode,
    /// Operator-declared audit rationale for the override, sourced
    /// from `controlOverrides[control].reason` or
    /// `controls[control].reason` in fleet.nix. `None` when no
    /// override applies (effective_mode equals probe-level mode);
    /// `Some("")` when the operator declared an override but left
    /// the rationale blank. CP writes this verbatim into event_log
    /// payloads so auditors recover "why was this control
    /// downgraded" from the signed event stream alone.
    /// A `None` value is omitted from serialized output
    /// (`skip_serializing_if`) and a missing field reads back as `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub override_reason: Option<String>,
}
125
/// Live state of one probe, stored by probe name in
/// `HostRolloutState::probes`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct ProbeRecord {
    /// Pass/fail status held by this record.
    pub status: ProbeStatus,
    /// Per-probe gate participation per RFC-0007 §3.4. Only
    /// `Enforce`-mode probes contribute to the soak-gate `failing_probes`
    /// builder; `Observe` and `Disabled` record events but do not gate.
    /// `#[serde(default)]` (-> `Enforce`) keeps rehydration safe for
    /// state persisted before this field existed.
    #[serde(default)]
    pub mode: ProbeMode,
    // NOTE(review): presumably the timestamp of the most recent probe event
    // folded into this record — confirm against the reducer.
    pub last_observed_at: DateTime<Utc>,
    // NOTE(review): presumably the timestamp of the most recent `Pass`, with
    // `None` meaning no pass has been recorded yet — confirm.
    pub last_pass_at: Option<DateTime<Utc>>,
    // NOTE(review): presumably set alongside a `Fail` status and `None`
    // otherwise — confirm against the reducer.
    pub failure_reason: Option<String>,
}
140
/// Per-(rollout, host) reducer state. Mirrors RFC-0005 §5
/// `HostRolloutRecord`; the persistence schema in Phase 4 serializes this
/// struct.
///
/// Every transition timestamp is agent-supplied (received via the wire and
/// stamped onto the corresponding field by the reducer). `dispatched_at` is
/// the lone exception — it's CP-issued, so CP wallclock is the source of
/// truth there (RFC-0005 §5).
//
// NOTE(review): only `Debug` and `Clone` are derived here — no
// `Serialize` / `Deserialize` — so the persistence mentioned above must go
// through some other mapping, or the derives are still to be added. Confirm.
#[derive(Debug, Clone)]
pub struct HostRolloutState {
    /// Rollout this record belongs to.
    pub rollout_id: RolloutId,
    /// Host this record belongs to.
    pub hostname: String,
    /// Channel name for the rollout.
    pub channel: String,
    /// Current position in the rollout state machine.
    pub state: HostState,

    // Closures
    /// Closure the rollout is driving the host towards.
    pub target_closure: ClosureHash,
    // NOTE(review): presumably the closure the host was running when the
    // Dispatch was issued — confirm against the reducer.
    pub current_closure_at_dispatch: Option<ClosureHash>,
    // NOTE(review): presumably the closure most recently reported by the
    // agent — confirm against the reducer.
    pub current_closure: Option<ClosureHash>,
    // NOTE(review): presumably the closure the agent rolled back to; see
    // `HostState::Reverted` — confirm.
    pub reverted_to: Option<ClosureHash>,

    // Transition timestamps (agent-supplied unless noted)
    /// CP-issued dispatch time (the one CP-wallclock timestamp).
    pub dispatched_at: DateTime<Utc>,
    pub dispatch_acked_at: Option<DateTime<Utc>>,
    pub activation_started_at: Option<DateTime<Utc>>,
    pub activation_completed_at: Option<DateTime<Utc>>,
    pub activation_failed_at: Option<DateTime<Utc>>,
    pub probe_observed_first_at: Option<DateTime<Utc>>,
    pub probe_failure_first_at: Option<DateTime<Utc>>,
    /// Populated as `Some(..)` by `new_pending` from its `soak_due_at`
    /// argument, i.e. supplied by whoever constructs the record.
    pub soak_due_at: Option<DateTime<Utc>>,
    pub converged_at: Option<DateTime<Utc>>,
    pub failed_at: Option<DateTime<Utc>>,
    // Not a timestamp: an `OnHealthFailure` value.
    // NOTE(review): presumably the policy the agent applied when the host
    // entered `Failed` — confirm against the reducer.
    pub policy_applied: Option<OnHealthFailure>,
    pub reverted_at: Option<DateTime<Utc>>,

    // Live probe state, by probe name
    pub probes: HashMap<ProbeName, ProbeRecord>,

    /// Monotonic per (hostname, rollout_id). Gaps signal lost events;
    /// out-of-order events are dropped with a warning at the runtime layer.
    pub last_event_seq: u64,
}
183
184impl HostRolloutState {
185    /// Construct the initial `Pending` state when CP queues a Dispatch (or
186    /// the agent receives one via long-poll on `/v1/agent/dispatch`).
187    ///
188    /// This is the only legitimate way to bring a `(rollout_id, hostname)`
189    /// record into existence. Subsequent transitions go through [`step`].
190    pub fn new_pending(
191        rollout_id: RolloutId,
192        hostname: String,
193        channel: String,
194        target_closure: ClosureHash,
195        dispatched_at: DateTime<Utc>,
196        soak_due_at: DateTime<Utc>,
197    ) -> Self {
198        Self {
199            rollout_id,
200            hostname,
201            channel,
202            state: HostState::Pending,
203            target_closure,
204            current_closure_at_dispatch: None,
205            current_closure: None,
206            reverted_to: None,
207            dispatched_at,
208            dispatch_acked_at: None,
209            activation_started_at: None,
210            activation_completed_at: None,
211            activation_failed_at: None,
212            probe_observed_first_at: None,
213            probe_failure_first_at: None,
214            soak_due_at: Some(soak_due_at),
215            converged_at: None,
216            failed_at: None,
217            policy_applied: None,
218            reverted_at: None,
219            probes: HashMap::new(),
220            last_event_seq: 0,
221        }
222    }
223}