nixfleet_agent/runtime/workers/probe_runners/
mod.rs

1//! Per-kind probe runners (RFC-0007 §3.1). Each runner consumes a
2//! `ProbeDecl` + returns a `RunnerOutcome`. Uniform strict-mode
3//! semantics: any runtime error → `ProbeStatus::Fail` with a
4//! `failure_reason` string. Per RFC-0007 §6 there is no `Unknown` or
5//! "swallowed error" class.
6//!
7//! Runners are pure (modulo I/O and the system clock) — they don't
8//! emit events; the probe worker handles event emission + state
9//! tracking. Each runner is `Send + 'static` so it can be `tokio::spawn`'d.
10
11use std::collections::HashMap;
12
13use chrono::{DateTime, Utc};
14use nixfleet_state_machine::{ProbeMode, ProbeStatus, ProbeSubResult};
15use serde::{Deserialize, Serialize};
16
17pub mod evidence;
18pub mod exec;
19pub mod http;
20pub mod tcp;
21
/// LOADBEARING: lower bound, in seconds, on how often a probe may run.
///
/// Protects the host (and whatever the probe targets) from a
/// misconfigured 0- or 1-second interval. Operator-declared
/// `intervalSeconds` values below this floor are raised to it at the
/// worker layer: `crate::runtime::workers::probe::spawn` applies
/// `interval_seconds.max(MIN_INTERVAL_SECS)`. A weaker `.max(1)` floor
/// would still allow a 1-second HTTP probe to send 60 requests per
/// minute to a backend the operator never meant to load.
pub const MIN_INTERVAL_SECS: u64 = 5;
30
/// LOADBEARING: per-failure cap, in **bytes**, on the amount of
/// runner-supplied text kept in a `failure_reason`. Keeps the wire body
/// bounded: without it, runners could emit arbitrarily long stderr /
/// response bodies that inflate the outbound queue's JSON payloads and
/// event-log row sizes. `RunnerOutcome::fail` routes every reason
/// through [`truncate_reason`], so runners get the cap automatically.
pub const FAILURE_REASON_MAX_LEN: usize = 512;

/// Cap `s` at [`FAILURE_REASON_MAX_LEN`] bytes of original text.
///
/// Strings that already fit (length `<=` the cap, including the empty
/// string) are returned unchanged. Longer strings are cut at the cap and
/// get `"...[truncated]"` appended, so the returned string can be up to
/// `FAILURE_REASON_MAX_LEN + "...[truncated]".len()` bytes long.
///
/// UTF-8 safe: if the cap falls inside a multibyte sequence, the cut is
/// moved back to the previous char boundary, dropping that code point
/// whole rather than splitting it.
pub fn truncate_reason(mut s: String) -> String {
    /// Suffix that tells the reader the reason was shortened.
    const TRUNCATION_MARKER: &str = "...[truncated]";

    if s.len() <= FAILURE_REASON_MAX_LEN {
        return s;
    }

    // `String::truncate` panics off a char boundary, so walk back first.
    // Index 0 is always a boundary, which bounds the loop.
    let mut end = FAILURE_REASON_MAX_LEN;
    while !s.is_char_boundary(end) {
        end -= 1;
    }

    // Reuse the owned buffer instead of formatting into a new allocation.
    s.truncate(end);
    s.push_str(TRUNCATION_MARKER);
    s
}
53
54/// On-disk probe declaration. Loaded from
55/// `/etc/nixfleet/agent/health-checks.json` (rendered from
56/// `lib/mk-fleet.nix:effectiveHealthChecks` by `_agent.nix`).
57#[derive(Debug, Clone, Deserialize, Serialize)]
58#[serde(rename_all = "camelCase")]
59pub struct ProbeDecl {
60    pub kind: String, // "http" | "tcp" | "exec" | "evidence"
61    pub mode: String, // "enforce" | "observe" | "disabled"
62    #[serde(default = "default_interval_seconds")]
63    pub interval_seconds: u64,
64    #[serde(default)]
65    pub run_once: bool,
66    // kind-specific (all optional; runner validates what it needs)
67    #[serde(default)]
68    pub url: Option<String>,
69    #[serde(default = "default_expect_status")]
70    pub expect_status: u16,
71    #[serde(default)]
72    pub host: Option<String>,
73    #[serde(default)]
74    pub port: Option<u16>,
75    #[serde(default = "default_connect_timeout_secs")]
76    pub connect_timeout_secs: u64,
77    #[serde(default)]
78    pub command: Vec<String>,
79    #[serde(default = "default_timeout_secs")]
80    pub timeout_secs: u64,
81    #[serde(default)]
82    pub framework: Option<String>,
83    #[serde(default = "default_evidence_path")]
84    pub evidence_path: String,
85    /// Per-control mode overrides on top of `mode`, scoped to the
86    /// framework declared in `framework`. Resolved per-control at
87    /// runtime: override > probe-level mode.
88    #[serde(default)]
89    pub control_overrides: HashMap<String, ControlOverrideDecl>,
90    /// Explicit per-control selection (custom-framework declaration).
91    /// Mutually exclusive with `framework`; eval-time validation in
92    /// `lib/mk-fleet.nix` rejects probes that set both.
93    #[serde(default)]
94    pub controls: HashMap<String, ControlOverrideDecl>,
95}
96
97/// Single entry in `controlOverrides` / `controls` (RFC-0007 §3.4
98/// per-control granularity). `mode` is the effective mode for the
99/// control; `reason` is operator-facing audit rationale, surfaced in
100/// event_log + dashboards.
101#[derive(Debug, Clone, Deserialize, Serialize)]
102#[serde(rename_all = "camelCase")]
103pub struct ControlOverrideDecl {
104    pub mode: String,
105    #[serde(default)]
106    pub reason: String,
107}
108
109impl ControlOverrideDecl {
110    pub fn resolved_mode(&self) -> ProbeMode {
111        match self.mode.as_str() {
112            "observe" => ProbeMode::Observe,
113            "disabled" => ProbeMode::Disabled,
114            _ => ProbeMode::Enforce,
115        }
116    }
117}
118
/// Serde fallback for `ProbeDecl::interval_seconds`: 30 seconds when the
/// declaration omits `intervalSeconds`.
fn default_interval_seconds() -> u64 {
    const FALLBACK_INTERVAL_SECS: u64 = 30;
    FALLBACK_INTERVAL_SECS
}
/// Serde fallback for `ProbeDecl::expect_status`: 200 when the
/// declaration omits `expectStatus`.
fn default_expect_status() -> u16 {
    const FALLBACK_STATUS: u16 = 200;
    FALLBACK_STATUS
}
/// Serde fallback for `ProbeDecl::connect_timeout_secs`: 5 seconds when
/// the declaration omits `connectTimeoutSecs`.
fn default_connect_timeout_secs() -> u64 {
    const FALLBACK_CONNECT_TIMEOUT_SECS: u64 = 5;
    FALLBACK_CONNECT_TIMEOUT_SECS
}
/// Serde fallback for `ProbeDecl::timeout_secs`: 10 seconds when the
/// declaration omits `timeoutSecs`.
fn default_timeout_secs() -> u64 {
    const FALLBACK_TIMEOUT_SECS: u64 = 10;
    FALLBACK_TIMEOUT_SECS
}
/// Serde fallback for `ProbeDecl::evidence_path`, used when the
/// declaration omits `evidencePath`.
fn default_evidence_path() -> String {
    String::from("/var/lib/nixfleet-compliance/evidence.json")
}
134
135/// Output of one runner invocation.
136#[derive(Debug, Clone)]
137pub struct RunnerOutcome {
138    pub status: ProbeStatus,
139    pub observed_at: DateTime<Utc>,
140    pub failure_reason: Option<String>,
141    /// `None` for non-evidence kinds; `Some(vec)` for evidence runner.
142    pub sub_results: Option<Vec<ProbeSubResult>>,
143}
144
145impl RunnerOutcome {
146    pub fn pass(observed_at: DateTime<Utc>) -> Self {
147        Self {
148            status: ProbeStatus::Pass,
149            observed_at,
150            failure_reason: None,
151            sub_results: None,
152        }
153    }
154
155    pub fn fail(observed_at: DateTime<Utc>, reason: impl Into<String>) -> Self {
156        Self {
157            status: ProbeStatus::Fail,
158            observed_at,
159            // Truncate at construction time so every runner gets the
160            // FAILURE_REASON_MAX_LEN cap via the type funnel
161            // (defense-in-depth — runners can't accidentally bypass).
162            failure_reason: Some(truncate_reason(reason.into())),
163            sub_results: None,
164        }
165    }
166}
167
168/// Dispatch on `decl.kind`. Unknown kinds fail closed.
169pub async fn run(decl: &ProbeDecl, now: DateTime<Utc>) -> RunnerOutcome {
170    match decl.kind.as_str() {
171        "http" => http::run(decl, now).await,
172        "tcp" => tcp::run(decl, now).await,
173        "exec" => exec::run(decl, now).await,
174        "evidence" => evidence::run(decl, now).await,
175        other => RunnerOutcome::fail(now, format!("unknown probe kind '{other}'")),
176    }
177}
178
#[cfg(test)]
mod tests {
    use super::*;
    use chrono::TimeZone;

    /// Suffix `truncate_reason` appends when it shortens a reason.
    const MARKER: &str = "...[truncated]";

    /// Fixed timestamp so outcome construction is deterministic.
    fn t0() -> DateTime<Utc> {
        Utc.with_ymd_and_hms(2026, 1, 1, 12, 0, 0).unwrap()
    }

    #[test]
    fn truncate_reason_passes_through_short_strings() {
        let short = "503 Service Unavailable".to_string();
        assert_eq!(truncate_reason(short.clone()), short);
    }

    #[test]
    fn truncate_reason_caps_at_max_len() {
        let long = "x".repeat(FAILURE_REASON_MAX_LEN + 100);
        let truncated = truncate_reason(long);
        // Pin the exact output, not just the suffix: the kept prefix
        // must be exactly the first FAILURE_REASON_MAX_LEN bytes.
        let expected = format!("{}{}", "x".repeat(FAILURE_REASON_MAX_LEN), MARKER);
        assert_eq!(truncated, expected);
        assert!(truncated.len() <= FAILURE_REASON_MAX_LEN + MARKER.len());
    }

    #[test]
    fn truncate_reason_exact_max_len_is_passthrough() {
        // Compare content, not only length: a string sitting exactly on
        // the cap must come back byte-for-byte, with no marker.
        let exact = "x".repeat(FAILURE_REASON_MAX_LEN);
        assert_eq!(truncate_reason(exact.clone()), exact);
    }

    #[test]
    fn truncate_reason_handles_utf8_boundary() {
        // Build a string where byte `FAILURE_REASON_MAX_LEN` lands inside
        // a multibyte UTF-8 sequence. `truncate_reason` must back up to
        // a char boundary instead of slicing mid-codepoint.
        let prefix_len = FAILURE_REASON_MAX_LEN - 1;
        let mut s = "a".repeat(prefix_len);
        s.push('é'); // 2 bytes; first byte at index prefix_len, second at prefix_len+1
        s.push_str("trailing");
        assert!(s.len() > FAILURE_REASON_MAX_LEN);
        let truncated = truncate_reason(s);
        // Must not panic, and the straddling 'é' must be dropped whole:
        // only the ASCII prefix survives in front of the marker.
        let expected = format!("{}{}", "a".repeat(prefix_len), MARKER);
        assert_eq!(truncated, expected);
    }

    #[test]
    fn fail_outcome_applies_truncation_at_construction() {
        // Regression guard: every RunnerOutcome::fail call routes
        // through truncate_reason so runners can produce arbitrary
        // failure-reason content (stderr tails, response bodies,
        // exception traces) without wire-size amplification.
        let huge = "fail reason ".repeat(200); // 2400 ASCII bytes
        let outcome = RunnerOutcome::fail(t0(), huge);
        assert!(matches!(outcome.status, ProbeStatus::Fail));
        assert!(outcome.sub_results.is_none());
        let reason = outcome.failure_reason.expect("failure_reason set");
        assert!(reason.len() <= FAILURE_REASON_MAX_LEN + MARKER.len());
        assert!(reason.starts_with("fail reason "));
        assert!(reason.ends_with(MARKER));
    }

    #[test]
    fn min_interval_secs_is_5() {
        // Pin the constant against drift. The 5-second floor is the
        // documented operator-DOS-protection baseline.
        assert_eq!(MIN_INTERVAL_SECS, 5);
    }
}