nixfleet_reconciler/planner_gates/
mod.rs

1//! Planner gates over `&FleetState` (the v0.2 shape, replacing the
2//! v0.1 `&Observed`-based gates per RFC-0006 §12).
3//!
4//! Two design properties the new shape enforces that the old didn't:
5//!
6//! - **No fail-open defaults.** The old gates carried
7//!   `unwrap_or(true)` for missing probe state (see
8//!   `Observed::host_probes_passing` docstring). The new gates consult
9//!   the reducer state — `HostRolloutState::probe_observed_first_at`
10//!   etc. — where absence has explicit meaning (probe hasn't run yet =
11//!   soak gate fails closed). RFC-0005 §6.
12//!
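//! - **Verified manifests only.** Every gate that reads manifest data
//!   takes `&SignedManifestSet` (the `quarantine` and
//!   `disruption_budget` gates consult no manifests and take none).
//!   Phase 2's `Verified<T>` newtype graduates from "type exists" to
//!   "type is required on the dispatch path".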
16//!
17//! First-block-wins order matches the old `evaluate_for_host`:
18//! `quarantine → channel_edges → wave_promotion → host_edges →
19//! disruption_budget → compliance_wave`. Quarantine is FIRST: a hash
20//! that just rolled back must stop instantly even if other gates would
21//! otherwise hold the host — otherwise the agent re-fetches and re-
22//! activates the bad closure on every cycle.
23
24pub mod channel_edges;
25pub mod compliance_wave;
26pub mod disruption_budget;
27pub mod host_edges;
28pub mod quarantine;
29pub mod wave_promotion;
30
31#[cfg(test)]
32mod tests;
33
34use crate::planner_types::{
35    ChannelId, ClosureHash, FleetState, HostId, QuarantineSet, RolloutId, SignedManifestSet,
36};
37
/// Reason a host can't be dispatched right now. Variants carry enough
/// detail to render the log line + observability event without re-
/// querying state. The legacy `&Observed`-shaped gate variants from
/// v0.1 are not represented.
///
/// Each variant maps to one human-readable message in
/// [`GateBlock::reason`] and one stable telemetry tag in
/// [`GateBlock::discriminator`]; the per-variant notes below restate
/// what those messages say.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum GateBlock {
    /// A predecessor channel has not converged yet.
    /// `predecessor_channel` is the name rendered in the reason text.
    ChannelEdges {
        predecessor_channel: String,
    },
    /// The host's wave is ahead of the wave currently being promoted
    /// (the reason text renders this as `host_wave > current_wave`).
    WavePromotion {
        host_wave: u32,
        current_wave: u32,
    },
    /// Default-deny variant of the wave-promotion gate: the channel
    /// declares waves but this host is not listed in any of them.
    /// Pre-hardening this case silently passed (`position(...)` returns
    /// None → `?`-early-return → gate returns None → host dispatches).
    /// That's an operator misconfiguration (host should be in a wave;
    /// either an operator forgot to assign it, or a fleet-build bug
    /// dropped it). Default-deny prevents silent dispatch of an
    /// unstaged host.
    HostUnstaged {
        channel: String,
    },
    /// A host this one is gated on has not yet reached `Converged`.
    /// `gating_host` is the name rendered in the reason text.
    HostEdge {
        gating_host: String,
    },
    /// A disruption budget is exhausted: the reason text renders
    /// `{in_flight}/{max} in flight` followed by `selector_summary`
    /// in parentheses.
    // NOTE(review): whether `in_flight` already includes within-tick
    // dispatches is decided in `disruption_budget`, not visible here.
    DisruptionBudget {
        in_flight: u32,
        max: u32,
        selector_summary: String,
    },
    /// Outstanding compliance failures exist on hosts in a wave
    /// earlier than `host_wave`; `failing_events_count` is how many.
    ComplianceWave {
        failing_events_count: usize,
        host_wave: u32,
    },
    /// The `(channel, closure_hash)` pair is quarantined. Per the
    /// reason text this follows sustained probe failures and is
    /// cleared by pushing a new closure.
    Quarantined {
        channel: String,
        closure_hash: String,
    },
}
79
80impl GateBlock {
81    /// Short human-readable reason for log lines + telemetry payloads.
82    pub fn reason(&self) -> String {
83        match self {
84            GateBlock::ChannelEdges {
85                predecessor_channel,
86            } => {
87                format!("channelEdges predecessor channel '{predecessor_channel}' not converged")
88            }
89            GateBlock::WavePromotion {
90                host_wave,
91                current_wave,
92            } => {
93                format!("wave-promotion: host_wave={host_wave} > current_wave={current_wave}")
94            }
95            GateBlock::HostUnstaged { channel } => {
96                format!(
97                    "wave-promotion: channel '{channel}' declares waves but this host is not assigned to any of them — fix the fleet declaration or the host's tag/wave selector"
98                )
99            }
100            GateBlock::HostEdge { gating_host } => {
101                format!("host-edge: gating host '{gating_host}' not yet Converged")
102            }
103            GateBlock::DisruptionBudget {
104                in_flight,
105                max,
106                selector_summary,
107            } => format!("disruption-budget: {in_flight}/{max} in flight ({selector_summary})"),
108            GateBlock::ComplianceWave {
109                failing_events_count,
110                host_wave,
111            } => format!(
112                "compliance-wave: {failing_events_count} outstanding failure(s) on hosts in wave < {host_wave}"
113            ),
114            GateBlock::Quarantined {
115                channel,
116                closure_hash,
117            } => format!(
118                "channel {channel} closure {closure_hash} quarantined (sustained probe failures); push a new closure to clear"
119            ),
120        }
121    }
122
123    /// Stable kebab-case discriminator for telemetry.
124    pub fn discriminator(&self) -> &'static str {
125        match self {
126            GateBlock::ChannelEdges { .. } => "channel-edges",
127            GateBlock::WavePromotion { .. } => "wave-promotion",
128            GateBlock::HostUnstaged { .. } => "wave-promotion-host-unstaged",
129            GateBlock::HostEdge { .. } => "host-edge",
130            GateBlock::DisruptionBudget { .. } => "disruption-budget",
131            GateBlock::ComplianceWave { .. } => "compliance-wave",
132            GateBlock::Quarantined { .. } => "quarantine",
133        }
134    }
135}
136
137/// First block wins. Cheapest-first; quarantine is FIRST for the
138/// anti-thrash property (see module docstring).
139///
140/// `tick_dispatched` is the per-budget counter the planner maintains
141/// across a single `plan_next()` call so within-tick over-commit is
142/// caught (see `planner_gates::disruption_budget` for the rationale).
143/// Empty map = first host in the tick.
144//
145// Eight parameters because each gate's input set is load-bearing and
146// distinct; bundling into a `GateCtx` struct would just move the
147// argument count without removing any of it. Clippy's heuristic
148// doesn't fit a single dispatch entrypoint of this shape.
149#[allow(clippy::too_many_arguments)]
150pub fn evaluate_for_dispatch(
151    fleet_state: &FleetState,
152    manifests: &SignedManifestSet,
153    quarantines: &QuarantineSet,
154    rollout_id: &RolloutId,
155    host: &HostId,
156    target_closure: &ClosureHash,
157    channel: &ChannelId,
158    tick_dispatched: &std::collections::HashMap<disruption_budget::BudgetId, u32>,
159) -> Option<GateBlock> {
160    if let Some(b) = quarantine::check(quarantines, channel, target_closure) {
161        return Some(b);
162    }
163    if let Some(b) = channel_edges::check(fleet_state, manifests, channel) {
164        return Some(b);
165    }
166    if let Some(b) = wave_promotion::check(fleet_state, manifests, host, rollout_id) {
167        return Some(b);
168    }
169    if let Some(b) = host_edges::check(fleet_state, manifests, host, rollout_id) {
170        return Some(b);
171    }
172    if let Some(b) = disruption_budget::check(fleet_state, rollout_id, host, tick_dispatched) {
173        return Some(b);
174    }
175    if let Some(b) = compliance_wave::check(fleet_state, manifests, host, rollout_id) {
176        return Some(b);
177    }
178    None
179}