nixfleet_control_plane/server/
state.rs

1//! Shared state + configuration types for the long-running server.
2
3use std::net::SocketAddr;
4use std::path::PathBuf;
5use std::sync::Arc;
6use std::sync::OnceLock;
7use std::sync::atomic::{AtomicBool, Ordering};
8use std::time::Duration;
9
10use chrono::{DateTime, Utc};
11use nixfleet_proto::FleetResolved;
12use tokio::sync::{RwLock, mpsc};
13
/// Default confirm deadline, in seconds (six minutes).
///
/// Must exceed the agent poll budget (~300s) plus slack, otherwise
/// magic-rollback can race the agent poll.
pub const DEFAULT_CONFIRM_DEADLINE_SECS: i64 = 6 * 60;
16
/// Startup configuration for the long-running server. The field docs
/// below refer to clap parsing / the CLI as the production source of
/// these values.
///
/// `Default` defaults are bogus on purpose; prod paths fail at first IO if
/// clap parsing is skipped.
#[derive(Debug, Clone)]
pub struct ServeArgs {
    /// Address the server binds to. NOTE(review): inferred from the name;
    /// the bind site is not in this file. `Default` uses `127.0.0.1:0`.
    pub listen: SocketAddr,
    /// Path to the server's TLS certificate (presumably PEM - confirm at
    /// the load site).
    pub tls_cert: PathBuf,
    /// Path to the private key matching `tls_cert` (presumably PEM -
    /// confirm at the load site).
    pub tls_key: PathBuf,
    /// CA bundle for client certificates. NOTE(review): presumably `None`
    /// disables client-cert verification; confirm in the TLS setup code.
    pub client_ca: Option<PathBuf>,
    /// Often the same path as `client_ca`.
    pub fleet_ca_cert: Option<PathBuf>,
    /// File-backed CA signer's private key PEM. TPM (pubkey + wrapper) wins.
    pub fleet_ca_key: Option<PathBuf>,
    /// TPM-backed CA signer: keyslot scope's `pubkey.raw` (64 raw P-256 X||Y).
    pub tpm_ca_pubkey_raw: Option<PathBuf>,
    /// TPM-backed CA signer: keyslot scope's `tpm-sign-<keyname>` wrapper.
    pub tpm_ca_sign_wrapper: Option<PathBuf>,
    /// Permit the file-backed CA-issuance backend under `--strict`. Default
    /// `false`: in strict mode, the file backend is refused unless TPM is
    /// also configured (in which case TPM wins) or this flag is set
    /// explicitly. See RFC-0010 §1.5.1.
    pub allow_file_ca_key: bool,
    /// Audit log location. NOTE(review): what `None` means (no audit log
    /// vs. a built-in default) is not visible here - confirm at the writer.
    pub audit_log_path: Option<PathBuf>,
    /// Fleet artifact file; presumably the `--artifact` flag referred to
    /// elsewhere in this file - TODO confirm against the CLI definition.
    pub artifact_path: PathBuf,
    /// Signature file for `artifact_path` (inferred from the name - confirm).
    pub signature_path: PathBuf,
    /// Trust-roots file; presumably the `--trust-file` flag described on
    /// `IssuancePaths::trust_path` - TODO confirm.
    pub trust_path: PathBuf,
    /// File-backed fallback used only when no agents checked in AND `channel_refs` is None.
    pub observed_path: PathBuf,
    /// Freshness window applied to the artifact. NOTE(review): exact
    /// semantics live in the verifier. `Default` uses 2_592_000 s (30 days).
    pub freshness_window: Duration,
    /// Confirm deadline in seconds; `Default` uses
    /// `DEFAULT_CONFIRM_DEADLINE_SECS`.
    pub confirm_deadline_secs: i64,
    /// `None` -> file-backed `--artifact` only.
    pub channel_refs: Option<crate::runtime::workers::manifest_poll::ChannelRefsSource>,
    /// Source for the revocations poll. Presumably `Some` iff the
    /// `--revocations-{artifact,signature}-url` flags were both set (see
    /// `AppState::revocations_required`) - TODO confirm.
    pub revocations: Option<crate::polling::revocations_poll::RevocationsSource>,
    /// Source for the bootstrap-nonces poll. Presumably `Some` iff the
    /// `--bootstrap-nonces-{artifact,signature}-url` flags were both set
    /// (see `AppState::bootstrap_nonces_required`) - TODO confirm.
    pub bootstrap_nonces: Option<crate::polling::bootstrap_nonces_poll::BootstrapNoncesSource>,
    /// `None` -> in-memory state only.
    pub db_path: Option<PathBuf>,
    /// `None` -> `/v1/agent/closure/<hash>` returns 501.
    pub closure_upstream: Option<String>,
    /// Pre-signed `<rolloutId>.{json,sig}` pairs; falls back to `rollouts_source`, then 503.
    pub rollouts_dir: Option<PathBuf>,
    /// HTTP-fetched manifests; required when `nixfleet-release` writes manifests post-build.
    pub rollouts_source: Option<crate::rollouts_source::RolloutsSource>,
    /// Refuse to start when any security-fallback flag is unset.
    pub strict: bool,
    /// `agent-<machineId>.<suffix>` for issued cert CNs. Must match the
    /// issuance CA's `dNSName` name constraint (D14).
    pub agent_cn_suffix: String,
    /// Validity baked into agent certs at enroll + renew. Default 30d;
    /// shortened by operators for hardware testing of renewal flows.
    pub agent_cert_validity: Duration,
    /// Test-only: skip the readiness gate so endpoint tests don't have to
    /// drive a real channel-refs poll. Production paths MUST leave `false`;
    /// the CLI never sets it.
    pub mark_ready_at_startup: bool,
    /// Test-only: seed the in-memory bootstrap-nonces allowlist at startup
    /// without running the poll loop. Production paths MUST leave `None`;
    /// the CLI never sets it.
    pub initial_nonces: Option<crate::db::allowed_nonces::AllowedNoncesView>,
}
75
76impl Default for ServeArgs {
77    fn default() -> Self {
78        Self {
79            listen: "127.0.0.1:0".parse().expect("static loopback addr"),
80            tls_cert: PathBuf::new(),
81            tls_key: PathBuf::new(),
82            client_ca: None,
83            fleet_ca_cert: None,
84            fleet_ca_key: None,
85            tpm_ca_pubkey_raw: None,
86            tpm_ca_sign_wrapper: None,
87            allow_file_ca_key: false,
88            audit_log_path: None,
89            artifact_path: PathBuf::new(),
90            signature_path: PathBuf::new(),
91            trust_path: PathBuf::new(),
92            observed_path: PathBuf::new(),
93            freshness_window: Duration::from_secs(2_592_000),
94            confirm_deadline_secs: DEFAULT_CONFIRM_DEADLINE_SECS,
95            channel_refs: None,
96            revocations: None,
97            bootstrap_nonces: None,
98            db_path: None,
99            closure_upstream: None,
100            rollouts_dir: None,
101            rollouts_source: None,
102            strict: false,
103            agent_cn_suffix: crate::auth::issuance::DEFAULT_AGENT_CN_SUFFIX.to_string(),
104            agent_cert_validity: crate::auth::issuance::AGENT_CERT_VALIDITY,
105            mark_ready_at_startup: false,
106            initial_nonces: None,
107        }
108    }
109}
110
/// One verified fleet artifact, kept together as a single value.
///
/// `(fleet, hash, raw_bytes)` tuple under one lock prevents readers
/// seeing fresh fleet with stale hash or stale bytes. `artifact_bytes` +
/// `signature_bytes` are the canonical signed bytes the
/// `cp_manifest_poll` worker fetched + verified; the
/// `/v1/fleet.resolved` route serves them directly so agents see the
/// exact bytes CP verified, not stale closure-embedded content from the
/// `--artifact` flag.
#[derive(Clone, Debug)]
pub struct VerifiedFleetSnapshot {
    /// Parsed fleet description, shared via `Arc` so cloning the snapshot
    /// does not copy it.
    pub fleet: Arc<FleetResolved>,
    /// Hash identifying this fleet. NOTE(review): algorithm and encoding
    /// are not visible in this file - confirm where it is computed.
    pub fleet_resolved_hash: String,
    /// Raw signed artifact bytes, as fetched and verified.
    pub artifact_bytes: Vec<u8>,
    /// Raw signature bytes that accompany `artifact_bytes`.
    pub signature_bytes: Vec<u8>,
}
125
/// Upstream used for closure requests, stored in
/// `AppState::closure_upstream`.
///
/// NOTE(review): presumably built from `ServeArgs::closure_upstream` and
/// used by the `/v1/agent/closure/<hash>` route - the construction and
/// call sites are outside this file; confirm there.
#[derive(Clone, Debug)]
pub struct ClosureUpstream {
    /// Base URL of the upstream (exact joining rules live at the call site).
    pub base_url: String,
    /// HTTP client used for requests to `base_url`.
    pub client: reqwest::Client,
}
131
/// Filesystem paths kept in `AppState::issuance_paths` for the
/// enrollment / issuance handlers.
///
/// The derived `Default` leaves the optional paths `None` and
/// `trust_path` empty.
#[derive(Debug, Clone, Default)]
pub struct IssuancePaths {
    /// Fleet CA certificate path. NOTE(review): presumably mirrors
    /// `ServeArgs::fleet_ca_cert` - confirm where this struct is filled.
    pub fleet_ca_cert: Option<PathBuf>,
    /// Fleet CA private key path. NOTE(review): presumably mirrors
    /// `ServeArgs::fleet_ca_key` - confirm where this struct is filled.
    pub fleet_ca_key: Option<PathBuf>,
    /// Audit log path. NOTE(review): presumably mirrors
    /// `ServeArgs::audit_log_path` - confirm where this struct is filled.
    pub audit_log: Option<PathBuf>,
    /// Path to the daemon-configured trust.json (the `--trust-file`
    /// flag). Both `/v1/enroll` (orgRootKey signature verify) and
    /// `/v1/agent/bootstrap-report` (same) read this. The polling
    /// loops have their own copy via `ChannelRefsSource.trust_path`
    /// - they all point at the same file in production. ONE source
    /// of truth for the daemon's trust roots, not derived from
    /// fleet_ca_cert (which broke when operators placed
    /// fleet-ca.pem outside `/etc/nixfleet/cp/`).
    pub trust_path: PathBuf,
}
147
/// Shared server state: configuration captured at startup plus the
/// runtime-mutable pieces (locks, atomics, channel senders) that
/// handlers and background tasks read and update.
///
/// `Debug` is implemented by hand below and `Default` yields an
/// unprimed, in-memory-only state.
pub struct AppState {
    /// Timestamp of the most recent tick; `None` by default.
    /// NOTE(review): the task that writes this is not in this file.
    pub last_tick_at: RwLock<Option<DateTime<Utc>>>,
    /// Paths used by the issuance handlers; see `IssuancePaths`.
    pub issuance_paths: RwLock<IssuancePaths>,
    /// Built once at server start from `ServeArgs` - `TpmCaSigner` if
    /// the TPM flags are set, `FileCaSigner` otherwise, `None` if no
    /// CA flags supplied (enroll/renew return 500). `dyn` lets enroll
    /// + renew handlers stay agnostic to signing backend.
    pub ca_signer: RwLock<Option<Arc<dyn crate::auth::issuance::CaSigner>>>,
    /// Database handle. NOTE(review): presumably `None` when
    /// `ServeArgs::db_path` is `None` (in-memory state only) - confirm
    /// in `serve()`.
    pub db: Option<Arc<crate::db::Db>>,
    /// Closure upstream, if configured; see `ClosureUpstream`.
    pub closure_upstream: Option<ClosureUpstream>,
    /// Latest verified fleet snapshot; `None` until a prime path
    /// populates it (see `artifact_primed`).
    pub verified_fleet: Arc<RwLock<Option<VerifiedFleetSnapshot>>>,
    /// Wake signal for `GET /v1/agent/dispatch` long-pollers. The applier
    /// (via `apply_plan_action::QueueDispatch` /
    /// `apply_effect::RemoteQueueDispatch`) sends `()` after every
    /// `dispatch_queue.upsert`. Every parked long-poll wakes and re-checks
    /// its own host's row; false wakes are negligible — `peek_for_host`
    /// is a single COUNT(*) against a covered index.
    pub dispatch_kick: tokio::sync::watch::Sender<()>,
    /// Confirm deadline in seconds; defaults to
    /// `DEFAULT_CONFIRM_DEADLINE_SECS`.
    pub confirm_deadline_secs: i64,
    /// See `ServeArgs::rollouts_dir` (presumably copied at startup -
    /// confirm in `serve()`).
    pub rollouts_dir: Option<PathBuf>,
    /// See `ServeArgs::rollouts_source` (presumably copied at startup -
    /// confirm in `serve()`).
    pub rollouts_source: Option<crate::rollouts_source::RolloutsSource>,
    /// Forge URLs + trust path the manifest_poll worker uses to refresh
    /// the runtime's `SignedManifestSet` cache. Mirrors the legacy
    /// `channel_refs_poll` config so the two pollers can read identical
    /// inputs during the 7a → 7c transition (legacy poller dies in 7c
    /// once the new runtime is end-to-end-verified).
    pub channel_refs_source: Option<crate::runtime::workers::manifest_poll::ChannelRefsSource>,
    /// See `ServeArgs::strict` (presumably copied at startup - confirm
    /// in `serve()`).
    pub strict: bool,
    /// See `ServeArgs::agent_cn_suffix`. Captured into AppState so the
    /// enroll/renew handlers can canonicalise CNs without going
    /// through `issuance_paths`.
    pub agent_cn_suffix: String,
    /// Validity duration baked into agent certs at enroll + renew.
    /// Default `AGENT_CERT_VALIDITY` (30 days); operators can override
    /// via `--agent-cert-validity-secs` for short-cycle hardware testing.
    pub agent_cert_validity: Duration,
    /// Set to `true` once the channel-refs poll (or build-time prime)
    /// has populated `verified_fleet` with a freshly-verified snapshot.
    /// Stays `false` indefinitely when neither prime path produces a
    /// verifiable artifact (operator must provision `artifact_path` or
    /// configure `channel_refs.artifact_url`). Read by the
    /// `require_ready` middleware to gate `/v1/*` with 503 until set.
    pub artifact_primed: Arc<AtomicBool>,
    /// Set to `true` once the revocations poll has applied a verified
    /// list at least once. Only consulted when `revocations_required`
    /// is `true`; otherwise the readiness check ignores this flag.
    pub revocations_primed: Arc<AtomicBool>,
    /// `true` iff `--revocations-{artifact,signature}-url` were both set
    /// at startup. Captured into AppState so the readiness check stays
    /// pure (no need to thread `ServeArgs` into middleware).
    pub revocations_required: bool,
    /// In-memory bootstrap-nonces allowlist. Replaced wholesale by the
    /// `bootstrap_nonces_poll` task per successful verify. Read by the
    /// `/v1/enroll` handler under a read-lock.
    pub allowed_nonces: Arc<RwLock<crate::db::allowed_nonces::AllowedNoncesView>>,
    /// Set to `true` once the bootstrap-nonces poll has applied a verified
    /// allowlist at least once. Only consulted when
    /// `bootstrap_nonces_required` is `true`.
    pub bootstrap_nonces_primed: Arc<AtomicBool>,
    /// `true` iff `--bootstrap-nonces-{artifact,signature}-url` were both
    /// set at startup. Captured into AppState so the readiness check stays
    /// pure (no need to thread `ServeArgs` into middleware).
    pub bootstrap_nonces_required: bool,

    /// Reducer-task input channel sender. Populated by `serve()` once
    /// `runtime::spawn` returns; the new `/v1/agent/{events,heartbeat,dispatch}`
    /// route handlers read it to push `ReducerInput` values to the reducer
    /// without blocking on AppState locks. `None` ⇒ 503 (runtime not yet
    /// spun up — only observable in a narrow startup window before
    /// `serve()` wires it).
    pub runtime_input_tx: OnceLock<mpsc::Sender<crate::runtime::ReducerInput>>,
    /// Cloneable sender on the bounded event_log writer channel. Same
    /// lifecycle as `runtime_input_tx`. Routes use this for inbound-event
    /// audit-log appends when they want immediate persistence (instead of
    /// routing through the reducer's applier).
    pub runtime_event_log_tx: OnceLock<crate::runtime::EventLogTx>,
}
225
226impl AppState {
227    /// Composite readiness: artifact verified AND (when configured)
228    /// revocations verified. Strict: full trust footprint loaded
229    /// before serving agents. See #95.
230    pub fn is_ready(&self) -> bool {
231        if !self.artifact_primed.load(Ordering::Acquire) {
232            return false;
233        }
234        if self.revocations_required && !self.revocations_primed.load(Ordering::Acquire) {
235            return false;
236        }
237        if self.bootstrap_nonces_required && !self.bootstrap_nonces_primed.load(Ordering::Acquire) {
238            return false;
239        }
240        true
241    }
242}
243
244impl Default for AppState {
245    fn default() -> Self {
246        Self {
247            last_tick_at: RwLock::new(None),
248            issuance_paths: RwLock::new(IssuancePaths::default()),
249            ca_signer: RwLock::new(None),
250            db: None,
251            closure_upstream: None,
252            verified_fleet: Arc::new(RwLock::new(None)),
253            dispatch_kick: tokio::sync::watch::channel(()).0,
254            confirm_deadline_secs: DEFAULT_CONFIRM_DEADLINE_SECS,
255            rollouts_dir: None,
256            rollouts_source: None,
257            channel_refs_source: None,
258            strict: false,
259            agent_cn_suffix: crate::auth::issuance::DEFAULT_AGENT_CN_SUFFIX.to_string(),
260            agent_cert_validity: crate::auth::issuance::AGENT_CERT_VALIDITY,
261            artifact_primed: Arc::new(AtomicBool::new(false)),
262            revocations_primed: Arc::new(AtomicBool::new(false)),
263            revocations_required: false,
264            allowed_nonces: Arc::new(RwLock::new(
265                crate::db::allowed_nonces::AllowedNoncesView::default(),
266            )),
267            bootstrap_nonces_primed: Arc::new(AtomicBool::new(false)),
268            bootstrap_nonces_required: false,
269            runtime_input_tx: OnceLock::new(),
270            runtime_event_log_tx: OnceLock::new(),
271        }
272    }
273}
274
275impl std::fmt::Debug for AppState {
276    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
277        f.debug_struct("AppState")
278            .field("db", &self.db.is_some())
279            .finish_non_exhaustive()
280    }
281}
282
#[cfg(test)]
mod ready_tests {
    use super::*;

    /// Default state with only the two `*_required` switches overridden.
    fn state_requiring(revocations: bool, bootstrap_nonces: bool) -> AppState {
        AppState {
            revocations_required: revocations,
            bootstrap_nonces_required: bootstrap_nonces,
            ..AppState::default()
        }
    }

    /// Flips one readiness flag to `true`, the way a poller would.
    fn prime(flag: &AtomicBool) {
        flag.store(true, Ordering::Release);
    }

    /// Nothing primed on a fresh default state, so `is_ready` must be
    /// false and the middleware can hold /v1/* with 503.
    #[test]
    fn fresh_state_is_not_ready() {
        let state = state_requiring(false, false);
        assert!(!state.is_ready(), "fresh state must not be ready");
    }

    /// No `--revocations-*-url` flags (`revocations_required = false`):
    /// priming the artifact is sufficient, `revocations_primed` is ignored.
    #[test]
    fn artifact_prime_alone_is_enough_when_revocations_not_required() {
        let state = state_requiring(false, false);
        prime(&state.artifact_primed);
        assert!(
            state.is_ready(),
            "artifact-only ready when revocations not required"
        );
    }

    /// Revocations polling configured (`revocations_required = true`):
    /// the artifact alone must NOT flip readiness.
    #[test]
    fn artifact_alone_is_not_enough_when_revocations_required() {
        let state = state_requiring(true, false);
        prime(&state.artifact_primed);
        assert!(
            !state.is_ready(),
            "must not be ready until revocations also primed",
        );
    }

    /// Revocations required and both flags primed -> ready.
    #[test]
    fn both_primed_with_revocations_required_is_ready() {
        let state = state_requiring(true, false);
        prime(&state.artifact_primed);
        prime(&state.revocations_primed);
        assert!(state.is_ready(), "both primed must flip ready");
    }

    /// Revocations primed without the artifact -> still not ready; the
    /// artifact gate applies regardless of the revocation list.
    #[test]
    fn revocations_alone_is_not_ready() {
        let state = state_requiring(true, false);
        prime(&state.revocations_primed);
        assert!(
            !state.is_ready(),
            "revocations without artifact is not ready"
        );
    }

    /// Bootstrap nonces required: readiness stays false with only the
    /// artifact primed and flips once the nonces flag is primed as well.
    #[test]
    fn bootstrap_nonces_required_blocks_ready_until_primed() {
        let state = state_requiring(false, true);
        prime(&state.artifact_primed);
        assert!(
            !state.is_ready(),
            "must not be ready until bootstrap_nonces also primed",
        );
        prime(&state.bootstrap_nonces_primed);
        assert!(
            state.is_ready(),
            "ready once bootstrap_nonces primed alongside artifact",
        );
    }
}
366}