nixfleet_state_machine/effect.rs
//! Reducer outputs. Descriptions of side effects, not executions.
//!
//! Per RFC-0006 §9. `Effect` currently defines 4 agent-only variants,
//! 4 CP-only variants, and 3 shared variants (`RemoteClearStaleQuarantine`
//! is deliberately absent — see the note inside `Effect`).
//! The agent applier (Phase 7) handles `Local*` + shared; the CP applier
//! (Phase 6) handles `Remote*` + shared. The compiler's exhaustiveness
//! check guarantees every variant has an arm in its applier — adding a
//! variant fails the build at every applier that doesn't account for it.
8
9use chrono::{DateTime, Utc};
10use nixfleet_proto::OnHealthFailure;
11
12use crate::event::ProbeTopologyEntry;
13use crate::state::{
14 ClosureHash, HostState, ProbeMode, ProbeName, ProbeStatus, ProbeSubResult, RolloutId,
15};
16
17#[derive(Debug, Clone, PartialEq)]
18pub enum Effect {
19 // ─────────────────────────────────────────────────────────────────────
20 // Agent-only effects (CP applier returns Error if it sees these)
21 // ─────────────────────────────────────────────────────────────────────
22 /// Fire `switch-to-configuration` on `target`. Agent applier delegates
23 /// to the `activation` module which detaches via `systemd-run` per the
24 /// agent-process-restart contract.
25 LocalFireSwitch {
26 rollout_id: RolloutId,
27 target: ClosureHash,
28 },
29
30 /// Fire `switch-to-configuration` on `closure` (rollback target read
31 /// from `current_closure_at_dispatch`). Agent decides this without a
32 /// CP signal — manifest's `onHealthFailure` is the single signed
33 /// source of truth (RFC-0005 §4.1).
34 LocalFireRollbackTo {
35 rollout_id: RolloutId,
36 closure: ClosureHash,
37 },
38
39 /// Drop the in-memory probe cache for this `(rollout, host)` pair.
40 /// Emitted on `LocalActivationCompleted` so stale `Pass` results from
41 /// the prior closure cannot satisfy the new rollout's gates.
42 LocalResetProbeCache { rollout_id: RolloutId },
43
44 /// Emit an outbound event to CP via `POST /v1/agent/events`. `durable`
45 /// requests on-disk queuing before the network call so a crash between
46 /// the local state change and the POST is recoverable on restart
47 /// (RFC-0005 §9.7 — open question; default policy decided in Phase 7).
48 ///
49 /// `rollout_id` carries the rollout this event belongs to so the agent
50 /// applier can persist the outbound queue entry against the correct
51 /// `(host, rollout, seq)` triple without consulting a side channel.
52 /// Closes Phase 7's `enrich_effect_with_rollout` stopgap.
53 LocalEmitEvent {
54 rollout_id: RolloutId,
55 payload: OutboundAgentEvent,
56 durable: bool,
57 },
58
59 // ─────────────────────────────────────────────────────────────────────
60 // CP-only effects (agent applier returns Error if it sees these)
61 // ─────────────────────────────────────────────────────────────────────
62 /// Queue a Dispatch for the agent's next long-poll on
63 /// `/v1/agent/dispatch`. Pull-only — CP never opens a connection
64 /// (RFC-0005 §2.1).
65 RemoteQueueDispatch {
66 host: String,
67 rollout_id: RolloutId,
68 target_closure: ClosureHash,
69 soak_due_at: DateTime<Utc>,
70 },
71
72 /// Mark a closure as quarantined on a channel after a `RollbackComplete`
73 /// arrives. Subsequent dispatches refuse this closure on this channel.
74 RemoteInsertQuarantine {
75 channel: String,
76 closure: ClosureHash,
77 },
78
79 // No `RemoteClearStaleQuarantine` variant: quarantines are
80 // append-only under the derived-view discipline (RFC-0008 §6.4).
81 // Operator-driven clearance, if ever needed,
82 // becomes a future explicit event (mirrors `OperatorClearance`).
83 /// Persist a fresh `host_rollout_records` row when CP first dispatches
84 /// to a host for a new rollout (Phase 4 schema).
85 RemoteOpenRolloutRecord {
86 rollout_id: RolloutId,
87 channel: String,
88 host: String,
89 },
90
91 /// Append an inbound agent event to the audit log
92 /// (RFC-0005 §4.3 + the broader event-log pattern — every state
93 /// mutation, gate decision, manifest poll lands here too).
94 RemoteAppendEventLog {
95 host: String,
96 rollout_id: RolloutId,
97 payload: OutboundAgentEvent,
98 },
99
100 // ─────────────────────────────────────────────────────────────────────
101 // Shared effects (both runtimes handle these)
102 // ─────────────────────────────────────────────────────────────────────
103 /// Record a state transition (from, to, at) for the event log + status
104 /// API. Emitted on every legal `HostState` change.
105 RecordTransition {
106 host: String,
107 rollout_id: RolloutId,
108 from: HostState,
109 to: HostState,
110 at: DateTime<Utc>,
111 },
112
113 EmitMetric {
114 name: &'static str,
115 labels: Vec<(&'static str, String)>,
116 value: f64,
117 },
118
119 EmitLog {
120 level: LogLevel,
121 target: &'static str,
122 message: &'static str,
123 fields: Vec<(&'static str, String)>,
124 },
125}
126
/// Severity attached to an `Effect::EmitLog` record.
///
/// Only equality is derived. No ordering between levels is defined here,
/// so callers must not rely on one.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum LogLevel {
    /// Trace-level record.
    Trace,
    /// Debug-level record.
    Debug,
    /// Info-level record.
    Info,
    /// Warn-level record.
    Warn,
    /// Error-level record.
    Error,
}
135
136/// Outbound wire payloads (POST `/v1/agent/events`). Defined here for the
137/// reducer's `LocalEmitEvent` effect; Phase 6/7 lifts these into
138/// `nixfleet-proto::agent_wire` once the HTTP routes are wired.
139#[derive(Debug, Clone, PartialEq)]
140pub enum OutboundAgentEvent {
141 DispatchAck {
142 current_closure_at_dispatch: ClosureHash,
143 received_at: DateTime<Utc>,
144 seq: u64,
145 },
146 ActivationStarted {
147 started_at: DateTime<Utc>,
148 switch_method: String,
149 seq: u64,
150 },
151 ActivationCompleted {
152 observed_current_closure: ClosureHash,
153 exit_code: i32,
154 completed_at: DateTime<Utc>,
155 seq: u64,
156 },
157 ActivationFailed {
158 exit_code: i32,
159 stderr_tail: String,
160 failed_at: DateTime<Utc>,
161 seq: u64,
162 },
163 /// LIFT #2 (RFC-0005 §4.2): live activation skipped because
164 /// `component` (dbus/systemd/kernel/init) cannot be live-swapped on
165 /// a running system. Profile + bootloader updated; next reboot
166 /// completes the activation. Host stays at Activating until the
167 /// operator reboots; CP's handle_heartbeat (LIFT #1) synthesizes
168 /// the completion on the agent's next boot-recovery handshake.
169 /// Visibility-only at the wire level — replaces the pre-LIFT #2
170 /// fake-`ActivationCompleted` that lied with `exit_code = 0` and a
171 /// stale `observed_current_closure`.
172 ActivationDeferred {
173 component: String,
174 deferred_at: DateTime<Utc>,
175 seq: u64,
176 },
177 ProbeTopologyDeclared {
178 probes: Vec<ProbeTopologyEntry>,
179 declared_at: DateTime<Utc>,
180 seq: u64,
181 },
182 ProbeObservedFirst {
183 probe_name: ProbeName,
184 mode: ProbeMode,
185 observed_at: DateTime<Utc>,
186 seq: u64,
187 },
188 ProbeResult {
189 probe_name: ProbeName,
190 mode: ProbeMode,
191 status: ProbeStatus,
192 observed_at: DateTime<Utc>,
193 failure_reason: Option<String>,
194 /// `None` for non-evidence probes; `Some(vec)` for evidence
195 /// probes, carrying per-control sub-results. The applier's
196 /// `probe_failures` co-write iterates this to populate one row
197 /// per failing control (RFC-0007 §7.1 + §7.2).
198 sub_results: Option<Vec<ProbeSubResult>>,
199 seq: u64,
200 },
201 ProbeFailureFirst {
202 probe_name: ProbeName,
203 mode: ProbeMode,
204 first_failed_at: DateTime<Utc>,
205 seq: u64,
206 },
207 Failed {
208 failed_at: DateTime<Utc>,
209 sustained_duration_secs: u64,
210 failing_probes: Vec<ProbeName>,
211 policy_applied: OnHealthFailure,
212 seq: u64,
213 },
214 RollbackComplete {
215 reverted_to_closure: ClosureHash,
216 exit_code: i32,
217 completed_at: DateTime<Utc>,
218 seq: u64,
219 },
220 Converged {
221 converged_at: DateTime<Utc>,
222 current_closure: ClosureHash,
223 seq: u64,
224 },
225}