nixfleet_agent/activation/
linux.rs

1//! Linux (NixOS) activation primitives. fire_* uses `systemd-run --unit=...`
2//! so the agent's SIGTERM can't kill the activation mid-run.
3
4use std::path::Path;
5
6use super::types::ActivationTarget;
7use anyhow::{Context, Result};
8use tokio::process::Command;
9
10use super::{ActivationBackend, ActivationOutcome, RollbackOutcome};
11
/// Stateless activation backend for Linux (NixOS) hosts.
///
/// A unit struct: it carries no data, and its `ActivationBackend`
/// implementation only forwards to the free functions in this module.
#[derive(Clone, Copy, Debug, Default)]
pub struct LinuxBackend;
14
/// Wires `LinuxBackend` into the `ActivationBackend` trait. Each method is a
/// one-line delegation to the module-level function of the same name, so the
/// real logic stays in free functions.
impl ActivationBackend for LinuxBackend {
    /// Delegates to the module-level `is_switch_in_progress`, which probes
    /// the switch lock file.
    async fn is_switch_in_progress(&self) -> bool {
        is_switch_in_progress().await
    }
    /// Delegates to the module-level `read_unit_exit_code`, which asks
    /// `systemctl show` for the unit's `ExecMainStatus`.
    async fn read_unit_exit_code(&self, unit_name: &str) -> Option<i32> {
        read_unit_exit_code(unit_name).await
    }
    /// Delegates to the module-level `fire_switch`. `Ok(None)` means the
    /// detached switch unit was queued; `Ok(Some(..))` carries an outcome
    /// that is already known (deferred to reboot, or failed to fire).
    async fn fire_switch(
        &self,
        target: &ActivationTarget,
        store_path: &str,
    ) -> Result<Option<ActivationOutcome>> {
        fire_switch(target, store_path).await
    }
    /// Delegates to the module-level `fire_rollback`. `Ok(None)` means the
    /// detached rollback unit was queued; `Ok(Some(..))` reports that
    /// `systemd-run` refused to queue it.
    async fn fire_rollback(&self, target_basename: &str) -> Result<Option<RollbackOutcome>> {
        fire_rollback(target_basename).await
    }
}
33
/// Lock file probed to decide whether a switch is currently running.
/// NOTE(review): presumably held exclusively by NixOS's
/// `switch-to-configuration` for the duration of a run - confirm.
const SWITCH_LOCK_PATH: &str = "/run/nixos/switch-to-configuration.lock";

/// Reports whether the lock at `SWITCH_LOCK_PATH` is currently contended.
///
/// Fail-open: absent lock file or missing flock binary -> false.
async fn is_switch_in_progress() -> bool {
    // Thin wrapper; the path-taking variant exists so tests can point the
    // probe at a temporary file.
    is_switch_in_progress_at(Path::new(SWITCH_LOCK_PATH)).await
}
40
41async fn is_switch_in_progress_at(lock_path: &Path) -> bool {
42    if !lock_path.exists() {
43        return false;
44    }
45    let status = Command::new("flock")
46        .arg("--nonblock")
47        .arg("--shared")
48        .arg(lock_path)
49        .arg("true")
50        .status()
51        .await;
52    match status {
53        Ok(s) if s.success() => false,
54        Ok(_) => true,
55        Err(_) => false,
56    }
57}
58
59/// `None` on failure / empty / non-numeric (never synthesise a misleading 0).
60async fn read_unit_exit_code(unit_name: &str) -> Option<i32> {
61    let output = Command::new("systemctl")
62        .arg("show")
63        .arg("--property=ExecMainStatus")
64        .arg("--value")
65        .arg(unit_name)
66        .output()
67        .await
68        .ok()?;
69    if !output.status.success() {
70        return None;
71    }
72    let stdout = String::from_utf8(output.stdout).ok()?;
73    let trimmed = stdout.trim();
74    if trimmed.is_empty() {
75        return None;
76    }
77    trimmed.parse::<i32>().ok()
78}
79
/// Components that nixos-rebuild refuses to swap on a live system, as
/// `(component name, path relative to the system closure)` pairs.
///
/// A swap is detected by canonicalising that path in the current and the new
/// closure and comparing the results. `init` is deliberately absent: it is a
/// stub regenerated for every system, so it differs between any two closures
/// even when nothing runtime-relevant changed, and listing it would defer
/// every single update. The genuinely unsafe components are systemd (PID 1),
/// the kernel, and dbus.
const SWITCH_INHIBITORS: &[(&str, &str)] = &[
    // dbus.service is the unit symlink - broker↔dbus swaps surface as a
    // different canonicalised target inside the new closure.
    ("dbus", "etc/systemd/system/dbus.service"),
    ("systemd", "sw/lib/systemd/systemd"),
    ("kernel", "kernel"),
];

/// Finds the first entry of `SWITCH_INHIBITORS` (in table order) whose path
/// resolves to different files under `current_system` and `new_store_path`,
/// and returns its component name.
///
/// A path that cannot be canonicalised on either side is skipped rather than
/// reported: only genuine swaps count, not absences. Returns `None` when no
/// entry differs.
fn detect_switch_inhibitors(current_system: &Path, new_store_path: &Path) -> Option<&'static str> {
    SWITCH_INHIBITORS.iter().find_map(|&(component, rel_path)| {
        // `.ok()?` drops this entry as soon as either side fails to resolve.
        let running = std::fs::canonicalize(current_system.join(rel_path)).ok()?;
        let incoming = std::fs::canonicalize(new_store_path.join(rel_path)).ok()?;
        (running != incoming).then_some(component)
    })
}
109
110const CURRENT_SYSTEM_PATH: &str = "/run/current-system";
111
112// FOOTGUN: --scope / --pipe --wait inherit the caller's cgroup; agent
113// SIGTERM would kill the switch mid-run. Use --unit for cgroup isolation.
114async fn fire_switch(
115    target: &ActivationTarget,
116    store_path: &str,
117) -> Result<Option<ActivationOutcome>> {
118    if let Some(component) =
119        detect_switch_inhibitors(Path::new(CURRENT_SYSTEM_PATH), Path::new(store_path))
120    {
121        tracing::warn!(
122            target_closure = %target.closure_hash,
123            component = component,
124            "agent: deferring live switch - critical-component swap requires reboot",
125        );
126        // LOADBEARING: `nix-env --set` creates the generation but does NOT
127        // write bootloader entries - only switch-to-configuration does.
128        // Without `boot` here, the next reboot lands back on the previous
129        // default and the defer-then-reboot lifecycle breaks.
130        let switch_bin = format!("{store_path}/bin/switch-to-configuration");
131        let boot_status = Command::new(&switch_bin)
132            .arg("boot")
133            .status()
134            .await
135            .with_context(|| format!("spawn {switch_bin} boot"))?;
136        if !boot_status.success() {
137            tracing::error!(
138                target_closure = %target.closure_hash,
139                exit_code = ?boot_status.code(),
140                "agent: switch-to-configuration boot failed in defer path; bootloader NOT updated",
141            );
142            return Ok(Some(ActivationOutcome::SwitchFailed {
143                phase: "defer-bootloader-update".to_string(),
144                exit_code: boot_status.code(),
145            }));
146        }
147        return Ok(Some(ActivationOutcome::DeferredPendingReboot {
148            component: component.to_string(),
149        }));
150    }
151
152    let _ = Command::new("systemctl")
153        .arg("reset-failed")
154        .arg("nixfleet-switch.service")
155        .status()
156        .await;
157
158    let switch_bin = format!("{store_path}/bin/switch-to-configuration");
159    tracing::info!(
160        target_closure = %target.closure_hash,
161        "agent: firing switch via systemd-run --unit=nixfleet-switch (detached)",
162    );
163    let fire_status = Command::new("systemd-run")
164        .arg("--unit=nixfleet-switch")
165        .arg("--collect")
166        .arg("--")
167        .arg(&switch_bin)
168        .arg("switch")
169        .status()
170        .await
171        .with_context(|| "spawn systemd-run --unit=nixfleet-switch")?;
172
173    if !fire_status.success() {
174        tracing::error!(
175            target_closure = %target.closure_hash,
176            exit_code = ?fire_status.code(),
177            "agent: systemd-run failed to queue switch unit",
178        );
179        return Ok(Some(ActivationOutcome::SwitchFailed {
180            phase: "systemd-run-fire".to_string(),
181            exit_code: fire_status.code(),
182        }));
183    }
184    Ok(None)
185}
186
187/// LOADBEARING: `target_basename` resolves to the rolled-back closure's
188/// store path, NOT `/run/current-system`. The agent fires rollback while the
189/// failed closure is still current, so its `switch-to-configuration` would
190/// "switch to" itself - a no-op that leaves nginx (or whatever caused the
191/// failure) still down. Use the freshly-flipped profile target's binary.
192async fn fire_rollback(target_basename: &str) -> Result<Option<RollbackOutcome>> {
193    let _ = Command::new("systemctl")
194        .arg("reset-failed")
195        .arg("nixfleet-rollback.service")
196        .status()
197        .await;
198
199    let switch_bin = rollback_switch_bin(target_basename);
200    tracing::info!(
201        target_basename = %target_basename,
202        switch_bin = %switch_bin,
203        "agent: firing rollback via systemd-run --unit=nixfleet-rollback (detached)",
204    );
205    let fire_status = Command::new("systemd-run")
206        .arg("--unit=nixfleet-rollback")
207        .arg("--collect")
208        .arg("--")
209        .arg(&switch_bin)
210        .arg("switch")
211        .status()
212        .await
213        .with_context(|| "spawn systemd-run --unit=nixfleet-rollback")?;
214
215    if !fire_status.success() {
216        tracing::error!(
217            exit_code = ?fire_status.code(),
218            "agent: systemd-run failed to queue rollback unit",
219        );
220        return Ok(Some(RollbackOutcome::Failed {
221            phase: "systemd-run-fire".to_string(),
222            exit_code: fire_status.code(),
223        }));
224    }
225    Ok(None)
226}
227
/// Builds the absolute path of the `switch-to-configuration` binary inside
/// the store path named `target_basename`, i.e.
/// `/nix/store/<target_basename>/bin/switch-to-configuration`.
///
/// `target_basename` is inserted verbatim; no validation is performed.
fn rollback_switch_bin(target_basename: &str) -> String {
    ["/nix/store/", target_basename, "/bin/switch-to-configuration"].concat()
}
231
#[cfg(test)]
mod tests {
    use super::*;

    /// Temp-dir guard plus the `current` and `new` system roots inside it.
    /// The guard must stay alive for as long as the roots are used.
    type Roots = (tempfile::TempDir, std::path::PathBuf, std::path::PathBuf);

    /// Relative paths of every `SWITCH_INHIBITORS` entry, in table order.
    fn inhibitor_rel_paths() -> Vec<&'static str> {
        SWITCH_INHIBITORS.iter().map(|&(_, rel)| rel).collect()
    }

    /// Fresh temp dir with `current` / `new` root paths; neither root is
    /// created on disk yet.
    fn empty_roots() -> Roots {
        let dir = tempfile::tempdir().unwrap();
        let cur = dir.path().join("current");
        let new = dir.path().join("new");
        (dir, cur, new)
    }

    /// Two system trees whose inhibitor paths all resolve to the same
    /// canonical targets - the identical-systems starting point.
    fn twin_roots(tag: &str) -> Roots {
        let (dir, cur, new) = empty_roots();
        let rels = inhibitor_rel_paths();
        make_fake_system(&cur, &rels, tag);
        share_targets(&cur, &new, &rels);
        (dir, cur, new)
    }

    /// Creates `link` as a symlink to `target`, creating the link's parent
    /// directories first.
    fn plant_symlink(target: &Path, link: &Path) {
        if let Some(parent) = link.parent() {
            std::fs::create_dir_all(parent).unwrap();
        }
        std::os::unix::fs::symlink(target, link).unwrap();
    }

    /// Creates the empty file `<base>/<dir_name>/<file_name>` and returns
    /// its path.
    fn fresh_target(base: &Path, dir_name: &str, file_name: &str) -> std::path::PathBuf {
        let holder = base.join(dir_name);
        std::fs::create_dir_all(&holder).unwrap();
        let file = holder.join(file_name);
        std::fs::write(&file, b"").unwrap();
        file
    }

    /// Build a fake system tree at `root` where each `rel_path` is a symlink
    /// pointing to a uniquely-named empty file under `<root>/targets-<tag>/`.
    /// Two trees only compare canonicalize-equal when one is derived from
    /// the other with `share_targets`.
    fn make_fake_system(root: &Path, rel_paths: &[&str], tag: &str) {
        let targets_dir = root.join(format!("targets-{tag}"));
        std::fs::create_dir_all(&targets_dir).unwrap();
        for &rel in rel_paths {
            // Flatten the relative path into a single file name.
            let target = targets_dir.join(rel.replace('/', "_"));
            std::fs::write(&target, b"").unwrap();
            plant_symlink(&target, &root.join(rel));
        }
    }

    /// Make dst symlinks resolve to the same canonical paths as src - the
    /// identical-systems case.
    fn share_targets(src_root: &Path, dst_root: &Path, rel_paths: &[&str]) {
        for &rel in rel_paths {
            let canonical = std::fs::canonicalize(src_root.join(rel)).unwrap();
            plant_symlink(&canonical, &dst_root.join(rel));
        }
    }

    #[tokio::test]
    async fn is_switch_in_progress_returns_false_when_lock_absent() {
        let scratch = tempfile::tempdir().expect("tempdir");
        let missing_lock = scratch.path().join("does-not-exist.lock");
        let in_progress = is_switch_in_progress_at(&missing_lock).await;
        assert!(!in_progress);
    }

    #[tokio::test]
    async fn is_switch_in_progress_returns_false_for_uncontended_lock() {
        let scratch = tempfile::tempdir().expect("tempdir");
        let free_lock = scratch.path().join("test.lock");
        std::fs::write(&free_lock, b"").expect("create lock file");
        let in_progress = is_switch_in_progress_at(&free_lock).await;
        assert!(!in_progress);
    }

    // The clippy allow is intentional: the point of this test is to call
    // `default()` on the unit struct and prove the derive is present.
    #[test]
    #[allow(clippy::default_constructed_unit_structs)]
    fn linux_backend_default_is_unit_struct() {
        let _literal: LinuxBackend = LinuxBackend;
        let _defaulted: LinuxBackend = LinuxBackend::default();
    }

    /// Regression: fire_rollback used to invoke
    /// `/run/current-system/bin/switch-to-configuration`, which is still the
    /// failed closure when rollback fires (profile pointer flipped, but
    /// /run/current-system unchanged until switch-to-configuration completes).
    /// Use the rolled-back target's own binary.
    #[test]
    fn rollback_switch_bin_uses_target_store_path_not_current_system() {
        let resolved = rollback_switch_bin("abc123-nixos-system-web-01-26.05");
        assert_eq!(
            resolved,
            "/nix/store/abc123-nixos-system-web-01-26.05/bin/switch-to-configuration",
        );
    }

    #[test]
    fn detect_returns_none_when_systems_are_identical() {
        let (_dir, cur, new) = twin_roots("shared");
        assert_eq!(detect_switch_inhibitors(&cur, &new), None);
    }

    #[test]
    fn detect_returns_dbus_when_dbus_service_target_differs() {
        // Same targets except dbus.service - only dbus should fire.
        let (dir, cur, new) = twin_roots("cur");
        // Re-point the dbus link in `new` at a brand-new file.
        let replacement = fresh_target(dir.path(), "targets-new-dbus", "dbus.service");
        let dbus_link = new.join("etc/systemd/system/dbus.service");
        std::fs::remove_file(&dbus_link).unwrap();
        plant_symlink(&replacement, &dbus_link);
        assert_eq!(detect_switch_inhibitors(&cur, &new), Some("dbus"));
    }

    #[test]
    fn detect_returns_none_when_one_side_missing_a_path() {
        // Only populate cur - new is empty, every canonicalize on new fails.
        let (_dir, cur, new) = empty_roots();
        make_fake_system(&cur, &inhibitor_rel_paths(), "cur");
        std::fs::create_dir_all(&new).unwrap();
        // Per the contract: missing path on one side is out-of-scope, returns None.
        assert_eq!(detect_switch_inhibitors(&cur, &new), None);
    }

    /// Regression: `<closure>/init` is regenerated per-system, so an init-
    /// only delta must NOT trigger defer (otherwise every update defers).
    #[test]
    fn detect_ignores_init_only_delta() {
        let (dir, cur, new) = twin_roots("cur");
        // Plant a differing `init` in `new` only - kernel/systemd/dbus stay
        // equal. With init removed from SWITCH_INHIBITORS this MUST be None.
        let init_file = fresh_target(dir.path(), "targets-new-init", "init");
        plant_symlink(&init_file, &new.join("init"));
        assert_eq!(detect_switch_inhibitors(&cur, &new), None);
    }

    /// Catches re-ordering or short-circuit regressions if SWITCH_INHIBITORS
    /// is reshuffled: kernel is the trailing entry and must still fire when
    /// dbus + systemd match.
    #[test]
    fn detect_returns_kernel_when_kernel_differs_first() {
        let (dir, cur, new) = twin_roots("cur");
        let replacement = fresh_target(dir.path(), "targets-new-kernel", "bzImage");
        let kernel_link = new.join("kernel");
        std::fs::remove_file(&kernel_link).unwrap();
        plant_symlink(&replacement, &kernel_link);
        assert_eq!(detect_switch_inhibitors(&cur, &new), Some("kernel"));
    }
}