nixfleet_reconciler/planner_gates/
disruption_budget.rs

1//! Disruption-budget gate (new-shape). `max_in_flight` enforced at
2//! dispatch time, summed across all active rollouts whose budgets share
3//! a selector (matches the old gate's "max one workstation in flight,
4//! ever" cross-rollout semantics).
5//!
6//! LOADBEARING: "in-flight" = `{Activating, Soaking}` (RFC-0002 §3).
7//! `Pending` is explicitly NOT in-flight — a Pending host has not
8//! received its Dispatch yet; counting it as in-flight causes a
9//! self-block where a freshly-`OpenRollout`'d host saturates the
10//! budget against its own Pending status and never transitions to
11//! Activating. `Failed`/`Reverted`/`Converged` are terminal and also
12//! not in-flight.
13//!
14//! ## Within-tick accumulator
15//!
16//! Excluding Pending from `is_in_flight` exposes a within-tick
17//! over-commit risk: a single `plan_next()` tick can emit N
18//! `QueueDispatch`es for N Pending hosts, each seeing `in_flight = 0`
19//! at gate-check time because none have transitioned to Activating
20//! yet. With `max_in_flight = 1` that's an N-fold over-commit.
21//!
22//! The planner threads `tick_dispatched: &HashMap<Selector, u32>`
23//! through `evaluate_for_dispatch` and increments the per-budget
24//! count after each `QueueDispatch`. The gate consults
25//! `in_flight + tick_count` against `max`.
26
27use std::collections::HashMap;
28
29use nixfleet_proto::Selector;
30use nixfleet_state_machine::HostState;
31
32use crate::planner_gates::GateBlock;
33use crate::planner_types::{FleetState, HostId, RolloutId};
34
/// Identity of a disruption budget, and the key type of the within-tick
/// dispatch accumulator that `check` receives as `tick_dispatched`.
///
/// This is a plain alias for `Selector`: the same `Selector` value that
/// identifies a budget across rollouts also keys the per-tick counter —
/// one budget identity, one place.
pub type BudgetId = Selector;
39
40pub fn check(
41    fleet_state: &FleetState,
42    rollout_id: &RolloutId,
43    host: &HostId,
44    tick_dispatched: &HashMap<BudgetId, u32>,
45) -> Option<GateBlock> {
46    let rollout = fleet_state.rollouts.get(rollout_id)?;
47
48    for budget in &rollout.budgets {
49        if !budget.hosts.iter().any(|h| h == host) {
50            continue;
51        }
52        let max = match budget.max_in_flight {
53            Some(m) => m,
54            None => continue,
55        };
56        let in_flight = in_flight_count(fleet_state, &budget.selector);
57        let pending_this_tick = tick_dispatched.get(&budget.selector).copied().unwrap_or(0);
58        let total = in_flight.saturating_add(pending_this_tick);
59        if total >= max {
60            return Some(GateBlock::DisruptionBudget {
61                in_flight: total,
62                max,
63                selector_summary: budget.selector.summary(),
64            });
65        }
66    }
67    None
68}
69
70fn in_flight_count(fleet_state: &FleetState, selector: &Selector) -> u32 {
71    let mut count: u32 = 0;
72    for (rollout_id, summary) in &fleet_state.rollouts {
73        let Some(matching_budget) = summary.budgets.iter().find(|b| &b.selector == selector) else {
74            continue;
75        };
76        for ((rid, hostname), state) in &fleet_state.host_states {
77            if rid != rollout_id {
78                continue;
79            }
80            if !matching_budget.hosts.iter().any(|h| h == hostname) {
81                continue;
82            }
83            if is_in_flight(state.state) {
84                count += 1;
85            }
86        }
87    }
88    count
89}
90
91fn is_in_flight(state: HostState) -> bool {
92    // LOADBEARING: `Pending` is NOT in-flight. A Pending host has not
93    // acked a Dispatch yet (and may not have been issued one in this
94    // tick). Counting it as in-flight self-blocks the host from
95    // transitioning to Activating.
96    matches!(state, HostState::Activating | HostState::Soaking)
97}