nixfleet_reconciler/planner_gates/disruption_budget.rs
1//! Disruption-budget gate (new-shape). `max_in_flight` enforced at
2//! dispatch time, summed across all active rollouts whose budgets share
3//! a selector (matches the old gate's "max one workstation in flight,
4//! ever" cross-rollout semantics).
5//!
6//! LOADBEARING: "in-flight" = `{Activating, Soaking}` (RFC-0002 §3).
7//! `Pending` is explicitly NOT in-flight — a Pending host has not
8//! received its Dispatch yet; counting it as in-flight causes a
9//! self-block where a freshly-`OpenRollout`'d host saturates the
10//! budget against its own Pending status and never transitions to
11//! Activating. `Failed`/`Reverted`/`Converged` are terminal and also
12//! not in-flight.
13//!
14//! ## Within-tick accumulator
15//!
16//! Excluding Pending from `is_in_flight` exposes a within-tick
17//! over-commit risk: a single `plan_next()` tick can emit N
18//! `QueueDispatch`es for N Pending hosts, each seeing `in_flight = 0`
19//! at gate-check time because none have transitioned to Activating
20//! yet. With `max_in_flight = 1` that's an N-fold over-commit.
21//!
22//! The planner threads `tick_dispatched: &HashMap<Selector, u32>`
23//! through `evaluate_for_dispatch` and increments the per-budget
24//! count after each `QueueDispatch`. The gate consults
25//! `in_flight + tick_count` against `max`.
26
27use std::collections::HashMap;
28
29use nixfleet_proto::Selector;
30use nixfleet_state_machine::HostState;
31
32use crate::planner_gates::GateBlock;
33use crate::planner_types::{FleetState, HostId, RolloutId};
34
/// Identity of a disruption budget, used as the key of `plan_next`'s
/// within-tick accumulator. This is a plain alias for `Selector`: the
/// same selector value that identifies a budget across rollouts also
/// keys the per-tick dispatch counter that `check` looks up via
/// `budget.selector` — one budget identity, one place.
pub type BudgetId = Selector;
39
40pub fn check(
41 fleet_state: &FleetState,
42 rollout_id: &RolloutId,
43 host: &HostId,
44 tick_dispatched: &HashMap<BudgetId, u32>,
45) -> Option<GateBlock> {
46 let rollout = fleet_state.rollouts.get(rollout_id)?;
47
48 for budget in &rollout.budgets {
49 if !budget.hosts.iter().any(|h| h == host) {
50 continue;
51 }
52 let max = match budget.max_in_flight {
53 Some(m) => m,
54 None => continue,
55 };
56 let in_flight = in_flight_count(fleet_state, &budget.selector);
57 let pending_this_tick = tick_dispatched.get(&budget.selector).copied().unwrap_or(0);
58 let total = in_flight.saturating_add(pending_this_tick);
59 if total >= max {
60 return Some(GateBlock::DisruptionBudget {
61 in_flight: total,
62 max,
63 selector_summary: budget.selector.summary(),
64 });
65 }
66 }
67 None
68}
69
70fn in_flight_count(fleet_state: &FleetState, selector: &Selector) -> u32 {
71 let mut count: u32 = 0;
72 for (rollout_id, summary) in &fleet_state.rollouts {
73 let Some(matching_budget) = summary.budgets.iter().find(|b| &b.selector == selector) else {
74 continue;
75 };
76 for ((rid, hostname), state) in &fleet_state.host_states {
77 if rid != rollout_id {
78 continue;
79 }
80 if !matching_budget.hosts.iter().any(|h| h == hostname) {
81 continue;
82 }
83 if is_in_flight(state.state) {
84 count += 1;
85 }
86 }
87 }
88 count
89}
90
91fn is_in_flight(state: HostState) -> bool {
92 // LOADBEARING: `Pending` is NOT in-flight. A Pending host has not
93 // acked a Dispatch yet (and may not have been issued one in this
94 // tick). Counting it as in-flight self-blocks the host from
95 // transitioning to Activating.
96 matches!(state, HostState::Activating | HostState::Soaking)
97}