nixfleet_control_plane/server/state.rs
1//! Shared state + configuration types for the long-running server.
2
3use std::net::SocketAddr;
4use std::path::PathBuf;
5use std::sync::Arc;
6use std::sync::OnceLock;
7use std::sync::atomic::{AtomicBool, Ordering};
8use std::time::Duration;
9
10use chrono::{DateTime, Utc};
11use nixfleet_proto::FleetResolved;
12use tokio::sync::{RwLock, mpsc};
13
/// Default confirm deadline, in seconds.
///
/// Has to stay above the agent poll budget (~300s) with some slack on top;
/// otherwise magic-rollback can race the agent's poll.
// ~300s poll budget + 60s slack.
pub const DEFAULT_CONFIRM_DEADLINE_SECS: i64 = 300 + 60;
16
17/// `Default` defaults are bogus on purpose; prod paths fail at first IO if
18/// clap parsing is skipped.
19#[derive(Debug, Clone)]
20pub struct ServeArgs {
21 pub listen: SocketAddr,
22 pub tls_cert: PathBuf,
23 pub tls_key: PathBuf,
24 pub client_ca: Option<PathBuf>,
25 /// Often the same path as `client_ca`.
26 pub fleet_ca_cert: Option<PathBuf>,
27 /// File-backed CA signer's private key PEM. TPM (pubkey + wrapper) wins.
28 pub fleet_ca_key: Option<PathBuf>,
29 /// TPM-backed CA signer: keyslot scope's `pubkey.raw` (64 raw P-256 X||Y).
30 pub tpm_ca_pubkey_raw: Option<PathBuf>,
31 /// TPM-backed CA signer: keyslot scope's `tpm-sign-<keyname>` wrapper.
32 pub tpm_ca_sign_wrapper: Option<PathBuf>,
33 /// Permit the file-backed CA-issuance backend under `--strict`. Default
34 /// `false`: in strict mode, the file backend is refused unless TPM is
35 /// also configured (in which case TPM wins) or this flag is set
36 /// explicitly. See RFC-0010 §1.5.1.
37 pub allow_file_ca_key: bool,
38 pub audit_log_path: Option<PathBuf>,
39 pub artifact_path: PathBuf,
40 pub signature_path: PathBuf,
41 pub trust_path: PathBuf,
42 /// File-backed fallback used only when no agents checked in AND `channel_refs` is None.
43 pub observed_path: PathBuf,
44 pub freshness_window: Duration,
45 pub confirm_deadline_secs: i64,
46 /// `None` -> file-backed `--artifact` only.
47 pub channel_refs: Option<crate::runtime::workers::manifest_poll::ChannelRefsSource>,
48 pub revocations: Option<crate::polling::revocations_poll::RevocationsSource>,
49 pub bootstrap_nonces: Option<crate::polling::bootstrap_nonces_poll::BootstrapNoncesSource>,
50 /// `None` -> in-memory state only.
51 pub db_path: Option<PathBuf>,
52 /// `None` -> `/v1/agent/closure/<hash>` returns 501.
53 pub closure_upstream: Option<String>,
54 /// Pre-signed `<rolloutId>.{json,sig}` pairs; falls back to `rollouts_source`, then 503.
55 pub rollouts_dir: Option<PathBuf>,
56 /// HTTP-fetched manifests; required when `nixfleet-release` writes manifests post-build.
57 pub rollouts_source: Option<crate::rollouts_source::RolloutsSource>,
58 /// Refuse to start when any security-fallback flag is unset.
59 pub strict: bool,
60 /// `agent-<machineId>.<suffix>` for issued cert CNs. Must match the
61 /// issuance CA's `dNSName` name constraint (D14).
62 pub agent_cn_suffix: String,
63 /// Validity baked into agent certs at enroll + renew. Default 30d;
64 /// shortened by operators for hardware testing of renewal flows.
65 pub agent_cert_validity: Duration,
66 /// Test-only: skip the readiness gate so endpoint tests don't have to
67 /// drive a real channel-refs poll. Production paths MUST leave `false`;
68 /// the CLI never sets it.
69 pub mark_ready_at_startup: bool,
70 /// Test-only: seed the in-memory bootstrap-nonces allowlist at startup
71 /// without running the poll loop. Production paths MUST leave `None`;
72 /// the CLI never sets it.
73 pub initial_nonces: Option<crate::db::allowed_nonces::AllowedNoncesView>,
74}
75
76impl Default for ServeArgs {
77 fn default() -> Self {
78 Self {
79 listen: "127.0.0.1:0".parse().expect("static loopback addr"),
80 tls_cert: PathBuf::new(),
81 tls_key: PathBuf::new(),
82 client_ca: None,
83 fleet_ca_cert: None,
84 fleet_ca_key: None,
85 tpm_ca_pubkey_raw: None,
86 tpm_ca_sign_wrapper: None,
87 allow_file_ca_key: false,
88 audit_log_path: None,
89 artifact_path: PathBuf::new(),
90 signature_path: PathBuf::new(),
91 trust_path: PathBuf::new(),
92 observed_path: PathBuf::new(),
93 freshness_window: Duration::from_secs(2_592_000),
94 confirm_deadline_secs: DEFAULT_CONFIRM_DEADLINE_SECS,
95 channel_refs: None,
96 revocations: None,
97 bootstrap_nonces: None,
98 db_path: None,
99 closure_upstream: None,
100 rollouts_dir: None,
101 rollouts_source: None,
102 strict: false,
103 agent_cn_suffix: crate::auth::issuance::DEFAULT_AGENT_CN_SUFFIX.to_string(),
104 agent_cert_validity: crate::auth::issuance::AGENT_CERT_VALIDITY,
105 mark_ready_at_startup: false,
106 initial_nonces: None,
107 }
108 }
109}
110
111/// `(fleet, hash, raw_bytes)` tuple under one lock prevents readers
112/// seeing fresh fleet with stale hash or stale bytes. `artifact_bytes` +
113/// `signature_bytes` are the canonical signed bytes the
114/// `cp_manifest_poll` worker fetched + verified; the
115/// `/v1/fleet.resolved` route serves them directly so agents see the
116/// exact bytes CP verified, not stale closure-embedded content from the
117/// `--artifact` flag.
118#[derive(Clone, Debug)]
119pub struct VerifiedFleetSnapshot {
120 pub fleet: Arc<FleetResolved>,
121 pub fleet_resolved_hash: String,
122 pub artifact_bytes: Vec<u8>,
123 pub signature_bytes: Vec<u8>,
124}
125
126#[derive(Clone, Debug)]
127pub struct ClosureUpstream {
128 pub base_url: String,
129 pub client: reqwest::Client,
130}
131
/// Filesystem locations used by certificate issuance.
#[derive(Clone, Debug, Default)]
pub struct IssuancePaths {
    pub fleet_ca_cert: Option<PathBuf>,
    pub fleet_ca_key: Option<PathBuf>,
    pub audit_log: Option<PathBuf>,
    /// The daemon-configured trust.json, i.e. the `--trust-file` flag.
    ///
    /// `/v1/enroll` and `/v1/agent/bootstrap-report` both read it to verify
    /// the orgRootKey signature. The polling loops carry their own copy in
    /// `ChannelRefsSource.trust_path`; in production all of them point at
    /// the same file. This is the ONE source of truth for the daemon's trust
    /// roots. It is not derived from `fleet_ca_cert`, because that broke
    /// when operators placed fleet-ca.pem outside `/etc/nixfleet/cp/`.
    pub trust_path: PathBuf,
}
147
148pub struct AppState {
149 pub last_tick_at: RwLock<Option<DateTime<Utc>>>,
150 pub issuance_paths: RwLock<IssuancePaths>,
151 /// Built once at server start from `ServeArgs` - `TpmCaSigner` if
152 /// the TPM flags are set, `FileCaSigner` otherwise, `None` if no
153 /// CA flags supplied (enroll/renew return 500). `dyn` lets enroll
154 /// + renew handlers stay agnostic to signing backend.
155 pub ca_signer: RwLock<Option<Arc<dyn crate::auth::issuance::CaSigner>>>,
156 pub db: Option<Arc<crate::db::Db>>,
157 pub closure_upstream: Option<ClosureUpstream>,
158 pub verified_fleet: Arc<RwLock<Option<VerifiedFleetSnapshot>>>,
159 /// Wake signal for `GET /v1/agent/dispatch` long-pollers. The applier
160 /// (via `apply_plan_action::QueueDispatch` /
161 /// `apply_effect::RemoteQueueDispatch`) sends `()` after every
162 /// `dispatch_queue.upsert`. Every parked long-poll wakes and re-checks
163 /// its own host's row; false wakes are negligible — `peek_for_host`
164 /// is a single COUNT(*) against a covered index.
165 pub dispatch_kick: tokio::sync::watch::Sender<()>,
166 pub confirm_deadline_secs: i64,
167 pub rollouts_dir: Option<PathBuf>,
168 pub rollouts_source: Option<crate::rollouts_source::RolloutsSource>,
169 /// Forge URLs + trust path the manifest_poll worker uses to refresh
170 /// the runtime's `SignedManifestSet` cache. Mirrors the legacy
171 /// `channel_refs_poll` config so the two pollers can read identical
172 /// inputs during the 7a → 7c transition (legacy poller dies in 7c
173 /// once the new runtime is end-to-end-verified).
174 pub channel_refs_source: Option<crate::runtime::workers::manifest_poll::ChannelRefsSource>,
175 pub strict: bool,
176 /// See `ServeArgs::agent_cn_suffix`. Captured into AppState so the
177 /// enroll/renew handlers can canonicalise CNs without going
178 /// through `issuance_paths`.
179 pub agent_cn_suffix: String,
180 /// Validity duration baked into agent certs at enroll + renew.
181 /// Default `AGENT_CERT_VALIDITY` (30 days); operators can override
182 /// via `--agent-cert-validity-secs` for short-cycle hardware testing.
183 pub agent_cert_validity: Duration,
184 /// Set to `true` once the channel-refs poll (or build-time prime)
185 /// has populated `verified_fleet` with a freshly-verified snapshot.
186 /// Stays `false` indefinitely when neither prime path produces a
187 /// verifiable artifact (operator must provision `artifact_path` or
188 /// configure `channel_refs.artifact_url`). Read by the
189 /// `require_ready` middleware to gate `/v1/*` with 503 until set.
190 pub artifact_primed: Arc<AtomicBool>,
191 /// Set to `true` once the revocations poll has applied a verified
192 /// list at least once. Only consulted when `revocations_required`
193 /// is `true`; otherwise the readiness check ignores this flag.
194 pub revocations_primed: Arc<AtomicBool>,
195 /// `true` iff `--revocations-{artifact,signature}-url` were both set
196 /// at startup. Captured into AppState so the readiness check stays
197 /// pure (no need to thread `ServeArgs` into middleware).
198 pub revocations_required: bool,
199 /// In-memory bootstrap-nonces allowlist. Replaced wholesale by the
200 /// `bootstrap_nonces_poll` task per successful verify. Read by the
201 /// `/v1/enroll` handler under a read-lock.
202 pub allowed_nonces: Arc<RwLock<crate::db::allowed_nonces::AllowedNoncesView>>,
203 /// Set to `true` once the bootstrap-nonces poll has applied a verified
204 /// allowlist at least once. Only consulted when
205 /// `bootstrap_nonces_required` is `true`.
206 pub bootstrap_nonces_primed: Arc<AtomicBool>,
207 /// `true` iff `--bootstrap-nonces-{artifact,signature}-url` were both
208 /// set at startup. Captured into AppState so the readiness check stays
209 /// pure (no need to thread `ServeArgs` into middleware).
210 pub bootstrap_nonces_required: bool,
211
212 /// Reducer-task input channel sender. Populated by `serve()` once
213 /// `runtime::spawn` returns; the new `/v1/agent/{events,heartbeat,dispatch}`
214 /// route handlers read it to push `ReducerInput` values to the reducer
215 /// without blocking on AppState locks. `None` ⇒ 503 (runtime not yet
216 /// spun up — only observable in a narrow startup window before
217 /// `serve()` wires it).
218 pub runtime_input_tx: OnceLock<mpsc::Sender<crate::runtime::ReducerInput>>,
219 /// Cloneable sender on the bounded event_log writer channel. Same
220 /// lifecycle as `runtime_input_tx`. Routes use this for inbound-event
221 /// audit-log appends when they want immediate persistence (instead of
222 /// routing through the reducer's applier).
223 pub runtime_event_log_tx: OnceLock<crate::runtime::EventLogTx>,
224}
225
226impl AppState {
227 /// Composite readiness: artifact verified AND (when configured)
228 /// revocations verified. Strict: full trust footprint loaded
229 /// before serving agents. See #95.
230 pub fn is_ready(&self) -> bool {
231 if !self.artifact_primed.load(Ordering::Acquire) {
232 return false;
233 }
234 if self.revocations_required && !self.revocations_primed.load(Ordering::Acquire) {
235 return false;
236 }
237 if self.bootstrap_nonces_required && !self.bootstrap_nonces_primed.load(Ordering::Acquire) {
238 return false;
239 }
240 true
241 }
242}
243
244impl Default for AppState {
245 fn default() -> Self {
246 Self {
247 last_tick_at: RwLock::new(None),
248 issuance_paths: RwLock::new(IssuancePaths::default()),
249 ca_signer: RwLock::new(None),
250 db: None,
251 closure_upstream: None,
252 verified_fleet: Arc::new(RwLock::new(None)),
253 dispatch_kick: tokio::sync::watch::channel(()).0,
254 confirm_deadline_secs: DEFAULT_CONFIRM_DEADLINE_SECS,
255 rollouts_dir: None,
256 rollouts_source: None,
257 channel_refs_source: None,
258 strict: false,
259 agent_cn_suffix: crate::auth::issuance::DEFAULT_AGENT_CN_SUFFIX.to_string(),
260 agent_cert_validity: crate::auth::issuance::AGENT_CERT_VALIDITY,
261 artifact_primed: Arc::new(AtomicBool::new(false)),
262 revocations_primed: Arc::new(AtomicBool::new(false)),
263 revocations_required: false,
264 allowed_nonces: Arc::new(RwLock::new(
265 crate::db::allowed_nonces::AllowedNoncesView::default(),
266 )),
267 bootstrap_nonces_primed: Arc::new(AtomicBool::new(false)),
268 bootstrap_nonces_required: false,
269 runtime_input_tx: OnceLock::new(),
270 runtime_event_log_tx: OnceLock::new(),
271 }
272 }
273}
274
275impl std::fmt::Debug for AppState {
276 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
277 f.debug_struct("AppState")
278 .field("db", &self.db.is_some())
279 .finish_non_exhaustive()
280 }
281}
282
#[cfg(test)]
mod ready_tests {
    use super::*;

    /// Default state with the two `*_required` switches set as given.
    fn state_requiring(revocations: bool, bootstrap_nonces: bool) -> AppState {
        AppState {
            revocations_required: revocations,
            bootstrap_nonces_required: bootstrap_nonces,
            ..AppState::default()
        }
    }

    /// Sets a single readiness flag to `true`.
    fn prime(flag: &AtomicBool) {
        flag.store(true, Ordering::Release);
    }

    /// A freshly built AppState has nothing primed, so `is_ready` has to be
    /// false and the middleware can hold /v1/* with 503.
    #[test]
    fn fresh_state_is_not_ready() {
        let state = AppState::default();
        assert!(!state.is_ready(), "fresh state must not be ready");
    }

    /// With `revocations_required = false` (no `--revocations-*-url` flags)
    /// the artifact prime is the only gate; `revocations_primed` is ignored.
    #[test]
    fn artifact_prime_alone_is_enough_when_revocations_not_required() {
        let state = AppState::default();
        prime(&state.artifact_primed);
        assert!(
            state.is_ready(),
            "artifact-only ready when revocations not required"
        );
    }

    /// With `revocations_required = true` (operator configured the polling
    /// loop) the artifact prime alone must NOT flip ready: the full trust
    /// footprint is required.
    #[test]
    fn artifact_alone_is_not_enough_when_revocations_required() {
        let state = state_requiring(true, false);
        prime(&state.artifact_primed);
        assert!(
            !state.is_ready(),
            "must not be ready until revocations also primed",
        );
    }

    /// Revocations required and both flags primed -> ready.
    #[test]
    fn both_primed_with_revocations_required_is_ready() {
        let state = state_requiring(true, false);
        prime(&state.artifact_primed);
        prime(&state.revocations_primed);
        assert!(state.is_ready(), "both primed must flip ready");
    }

    /// Revocations primed without the artifact -> not ready (dispatch cannot
    /// be served without a verified fleet snapshot, even if the revocation
    /// list loaded).
    #[test]
    fn revocations_alone_is_not_ready() {
        let state = state_requiring(true, false);
        prime(&state.revocations_primed);
        assert!(
            !state.is_ready(),
            "revocations without artifact is not ready"
        );
    }

    /// Bootstrap nonces required: readiness waits for that flag too, and
    /// flips as soon as it is primed alongside the artifact.
    #[test]
    fn bootstrap_nonces_required_blocks_ready_until_primed() {
        let state = state_requiring(false, true);
        prime(&state.artifact_primed);
        assert!(
            !state.is_ready(),
            "must not be ready until bootstrap_nonces also primed",
        );
        prime(&state.bootstrap_nonces_primed);
        assert!(
            state.is_ready(),
            "ready once bootstrap_nonces primed alongside artifact",
        );
    }
}
366}