nixfleet_state_machine/
effect.rs

//! Reducer outputs. Descriptions of side effects, not executions.
//!
//! Per RFC-0006 §9. As defined in this file: 4 agent-only variants, 4 CP-only
//! variants (there is deliberately no `RemoteClearStaleQuarantine`), 3 shared.
//! The agent applier (Phase 7) handles `Local*` + shared; the CP applier
//! (Phase 6) handles `Remote*` + shared. The compiler's exhaustiveness
//! check guarantees every variant has an arm in its applier — adding a
//! variant fails the build at every applier that doesn't account for it.
8
9use chrono::{DateTime, Utc};
10use nixfleet_proto::OnHealthFailure;
11
12use crate::event::ProbeTopologyEntry;
13use crate::state::{
14    ClosureHash, HostState, ProbeMode, ProbeName, ProbeStatus, ProbeSubResult, RolloutId,
15};
16
17#[derive(Debug, Clone, PartialEq)]
18pub enum Effect {
19    // ─────────────────────────────────────────────────────────────────────
20    // Agent-only effects (CP applier returns Error if it sees these)
21    // ─────────────────────────────────────────────────────────────────────
22    /// Fire `switch-to-configuration` on `target`. Agent applier delegates
23    /// to the `activation` module which detaches via `systemd-run` per the
24    /// agent-process-restart contract.
25    LocalFireSwitch {
26        rollout_id: RolloutId,
27        target: ClosureHash,
28    },
29
30    /// Fire `switch-to-configuration` on `closure` (rollback target read
31    /// from `current_closure_at_dispatch`). Agent decides this without a
32    /// CP signal — manifest's `onHealthFailure` is the single signed
33    /// source of truth (RFC-0005 §4.1).
34    LocalFireRollbackTo {
35        rollout_id: RolloutId,
36        closure: ClosureHash,
37    },
38
39    /// Drop the in-memory probe cache for this `(rollout, host)` pair.
40    /// Emitted on `LocalActivationCompleted` so stale `Pass` results from
41    /// the prior closure cannot satisfy the new rollout's gates.
42    LocalResetProbeCache { rollout_id: RolloutId },
43
44    /// Emit an outbound event to CP via `POST /v1/agent/events`. `durable`
45    /// requests on-disk queuing before the network call so a crash between
46    /// the local state change and the POST is recoverable on restart
47    /// (RFC-0005 §9.7 — open question; default policy decided in Phase 7).
48    ///
49    /// `rollout_id` carries the rollout this event belongs to so the agent
50    /// applier can persist the outbound queue entry against the correct
51    /// `(host, rollout, seq)` triple without consulting a side channel.
52    /// Closes Phase 7's `enrich_effect_with_rollout` stopgap.
53    LocalEmitEvent {
54        rollout_id: RolloutId,
55        payload: OutboundAgentEvent,
56        durable: bool,
57    },
58
59    // ─────────────────────────────────────────────────────────────────────
60    // CP-only effects (agent applier returns Error if it sees these)
61    // ─────────────────────────────────────────────────────────────────────
62    /// Queue a Dispatch for the agent's next long-poll on
63    /// `/v1/agent/dispatch`. Pull-only — CP never opens a connection
64    /// (RFC-0005 §2.1).
65    RemoteQueueDispatch {
66        host: String,
67        rollout_id: RolloutId,
68        target_closure: ClosureHash,
69        soak_due_at: DateTime<Utc>,
70    },
71
72    /// Mark a closure as quarantined on a channel after a `RollbackComplete`
73    /// arrives. Subsequent dispatches refuse this closure on this channel.
74    RemoteInsertQuarantine {
75        channel: String,
76        closure: ClosureHash,
77    },
78
79    // No `RemoteClearStaleQuarantine` variant: quarantines are
80    // append-only under the derived-view discipline (RFC-0008 §6.4).
81    // Operator-driven clearance, if ever needed,
82    // becomes a future explicit event (mirrors `OperatorClearance`).
83    /// Persist a fresh `host_rollout_records` row when CP first dispatches
84    /// to a host for a new rollout (Phase 4 schema).
85    RemoteOpenRolloutRecord {
86        rollout_id: RolloutId,
87        channel: String,
88        host: String,
89    },
90
91    /// Append an inbound agent event to the audit log
92    /// (RFC-0005 §4.3 + the broader event-log pattern — every state
93    /// mutation, gate decision, manifest poll lands here too).
94    RemoteAppendEventLog {
95        host: String,
96        rollout_id: RolloutId,
97        payload: OutboundAgentEvent,
98    },
99
100    // ─────────────────────────────────────────────────────────────────────
101    // Shared effects (both runtimes handle these)
102    // ─────────────────────────────────────────────────────────────────────
103    /// Record a state transition (from, to, at) for the event log + status
104    /// API. Emitted on every legal `HostState` change.
105    RecordTransition {
106        host: String,
107        rollout_id: RolloutId,
108        from: HostState,
109        to: HostState,
110        at: DateTime<Utc>,
111    },
112
113    EmitMetric {
114        name: &'static str,
115        labels: Vec<(&'static str, String)>,
116        value: f64,
117    },
118
119    EmitLog {
120        level: LogLevel,
121        target: &'static str,
122        message: &'static str,
123        fields: Vec<(&'static str, String)>,
124    },
125}
126
/// Level attached to an `Effect::EmitLog` record.
///
/// The type is `Copy` and comparable for equality. No ordering is derived,
/// so levels cannot be compared with `<` / `>`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum LogLevel {
    Trace,
    Debug,
    Info,
    Warn,
    Error,
}
135
136/// Outbound wire payloads (POST `/v1/agent/events`). Defined here for the
137/// reducer's `LocalEmitEvent` effect; Phase 6/7 lifts these into
138/// `nixfleet-proto::agent_wire` once the HTTP routes are wired.
139#[derive(Debug, Clone, PartialEq)]
140pub enum OutboundAgentEvent {
141    DispatchAck {
142        current_closure_at_dispatch: ClosureHash,
143        received_at: DateTime<Utc>,
144        seq: u64,
145    },
146    ActivationStarted {
147        started_at: DateTime<Utc>,
148        switch_method: String,
149        seq: u64,
150    },
151    ActivationCompleted {
152        observed_current_closure: ClosureHash,
153        exit_code: i32,
154        completed_at: DateTime<Utc>,
155        seq: u64,
156    },
157    ActivationFailed {
158        exit_code: i32,
159        stderr_tail: String,
160        failed_at: DateTime<Utc>,
161        seq: u64,
162    },
163    /// LIFT #2 (RFC-0005 §4.2): live activation skipped because
164    /// `component` (dbus/systemd/kernel/init) cannot be live-swapped on
165    /// a running system. Profile + bootloader updated; next reboot
166    /// completes the activation. Host stays at Activating until the
167    /// operator reboots; CP's handle_heartbeat (LIFT #1) synthesizes
168    /// the completion on the agent's next boot-recovery handshake.
169    /// Visibility-only at the wire level — replaces the pre-LIFT #2
170    /// fake-`ActivationCompleted` that lied with `exit_code = 0` and a
171    /// stale `observed_current_closure`.
172    ActivationDeferred {
173        component: String,
174        deferred_at: DateTime<Utc>,
175        seq: u64,
176    },
177    ProbeTopologyDeclared {
178        probes: Vec<ProbeTopologyEntry>,
179        declared_at: DateTime<Utc>,
180        seq: u64,
181    },
182    ProbeObservedFirst {
183        probe_name: ProbeName,
184        mode: ProbeMode,
185        observed_at: DateTime<Utc>,
186        seq: u64,
187    },
188    ProbeResult {
189        probe_name: ProbeName,
190        mode: ProbeMode,
191        status: ProbeStatus,
192        observed_at: DateTime<Utc>,
193        failure_reason: Option<String>,
194        /// `None` for non-evidence probes; `Some(vec)` for evidence
195        /// probes, carrying per-control sub-results. The applier's
196        /// `probe_failures` co-write iterates this to populate one row
197        /// per failing control (RFC-0007 §7.1 + §7.2).
198        sub_results: Option<Vec<ProbeSubResult>>,
199        seq: u64,
200    },
201    ProbeFailureFirst {
202        probe_name: ProbeName,
203        mode: ProbeMode,
204        first_failed_at: DateTime<Utc>,
205        seq: u64,
206    },
207    Failed {
208        failed_at: DateTime<Utc>,
209        sustained_duration_secs: u64,
210        failing_probes: Vec<ProbeName>,
211        policy_applied: OnHealthFailure,
212        seq: u64,
213    },
214    RollbackComplete {
215        reverted_to_closure: ClosureHash,
216        exit_code: i32,
217        completed_at: DateTime<Utc>,
218        seq: u64,
219    },
220    Converged {
221        converged_at: DateTime<Utc>,
222        current_closure: ClosureHash,
223        seq: u64,
224    },
225}