nixfleet_agent/activation/
pipeline.rs

1//! Main activate pipeline: realise -> set-profile -> fire -> poll -> self-correct.
2
3use super::types::ActivationTarget;
4use anyhow::{Context, Result};
5use tokio::process::Command;
6
7use super::profile::self_correct_profile;
8use super::realise::{RealiseError, realise};
9use super::types::ActivationBackend;
10use super::types::ActivationOutcome;
11use super::verify_poll::{PollOutcome, VerifyPoll, read_current_system_basename};
12
13/// Tests inject a fake backend; production calls the `activate(target)` façade.
14pub async fn activate_with<B: ActivationBackend>(
15    backend: &B,
16    target: &ActivationTarget,
17) -> Result<ActivationOutcome> {
18    tracing::info!(
19        target_closure = %target.closure_hash,
20        target_channel = %target.channel_ref,
21        "agent: activating target",
22    );
23
24    // GOTCHA: racing an in-flight switch yields spurious SwitchFailed timeouts even on success.
25    if backend.is_switch_in_progress().await {
26        tracing::info!(
27            target_closure = %target.closure_hash,
28            "agent: skipping activation - another switch-to-configuration is in flight",
29        );
30        return Ok(ActivationOutcome::RealiseFailed {
31            reason: "switch-to-configuration lock held by another process; will retry on next tick"
32                .to_string(),
33        });
34    }
35
36    // LOADBEARING: realise forces fetch + sig verify; path-equality catches symlink/redirect surprises.
37    let store_path = format!("/nix/store/{}", target.closure_hash);
38    let realised = match realise(&store_path).await {
39        Ok(p) => p,
40        Err(RealiseError::SignatureMismatch { stderr_tail }) => {
41            tracing::error!(
42                target_closure = %target.closure_hash,
43                stderr_tail = %stderr_tail,
44                "agent: closure signature mismatch - refused by nix substituter trust",
45            );
46            return Ok(ActivationOutcome::SignatureMismatch {
47                closure_hash: target.closure_hash.clone(),
48                stderr_tail,
49            });
50        }
51        Err(RealiseError::Other(err)) => {
52            tracing::error!(
53                target_closure = %target.closure_hash,
54                error = %err,
55                "agent: realisation failed; not switching",
56            );
57            return Ok(ActivationOutcome::RealiseFailed {
58                reason: err.to_string(),
59            });
60        }
61    };
62
63    if realised != store_path {
64        tracing::error!(
65            target_closure = %target.closure_hash,
66            requested = %store_path,
67            realised = %realised,
68            "agent: nix-store --realise returned an unexpected path; not switching",
69        );
70        return Ok(ActivationOutcome::RealiseFailed {
71            reason: format!("realised path {realised} does not match requested {store_path}",),
72        });
73    }
74
75    // LOADBEARING: set profile BEFORE fire - switch-to-configuration {boot,switch,test}
76    // reads `/nix/var/nix/profiles/system` to derive the generation number it
77    // writes into bootloader entries. The bootloader update itself happens
78    // inside the backend's fire_switch (live switch on the happy path,
79    // explicit `switch-to-configuration boot` on the defer path).
80    let set_status = Command::new("nix-env")
81        .arg("--profile")
82        .arg("/nix/var/nix/profiles/system")
83        .arg("--set")
84        .arg(&store_path)
85        .status()
86        .await
87        .with_context(|| "spawn nix-env --set")?;
88
89    if !set_status.success() {
90        tracing::error!(
91            target_closure = %target.closure_hash,
92            exit_code = ?set_status.code(),
93            "agent: nix-env --set failed; not running switch-to-configuration",
94        );
95        return Ok(ActivationOutcome::SwitchFailed {
96            phase: "nix-env-set".to_string(),
97            exit_code: set_status.code(),
98        });
99    }
100
101    // LOADBEARING: pre-switch basename feeds flip-to-unexpected detection - abort if read fails.
102    let previous_basename = match read_current_system_basename().await {
103        Ok(b) => b,
104        Err(err) => {
105            tracing::error!(
106                target_closure = %target.closure_hash,
107                error = %err,
108                "agent: cannot read /run/current-system pre-switch; aborting activation",
109            );
110            return Ok(ActivationOutcome::RealiseFailed {
111                reason: format!("pre-switch /run/current-system read failed: {err}"),
112            });
113        }
114    };
115
116    if let Some(outcome) = backend.fire_switch(target, &store_path).await? {
117        return Ok(outcome);
118    }
119
120    // GOTCHA: SIGTERM mid-poll is OK - detached switch unit lands, boot-recovery confirms retroactively.
121    let expected = &target.closure_hash;
122    match VerifyPoll::new(expected)
123        .with_previous(&previous_basename)
124        .until_settled()
125        .await
126    {
127        PollOutcome::Settled => {
128            // GOTCHA: activation script may re-point profile after our set - self-correct or boot pointer drifts.
129            if let Err(err) = self_correct_profile(&store_path).await {
130                tracing::warn!(
131                    error = %err,
132                    "agent: profile self-correction failed (non-fatal); current-system OK so activation continues",
133                );
134            }
135            tracing::info!(
136                target_closure = %expected,
137                "agent: activation fire-and-forget complete (poll observed expected closure)",
138            );
139            Ok(ActivationOutcome::FiredAndPolled)
140        }
141        PollOutcome::Timeout { last_observed } => {
142            let exit_code = backend.read_unit_exit_code("nixfleet-switch.service").await;
143            tracing::error!(
144                target_closure = %expected,
145                last_observed = %last_observed,
146                exit_code = ?exit_code,
147                "agent: switch poll timed out - declaring SwitchFailed",
148            );
149            Ok(ActivationOutcome::SwitchFailed {
150                phase: "switch-poll-timeout".to_string(),
151                exit_code,
152            })
153        }
154        PollOutcome::FlippedToUnexpected { observed } => {
155            tracing::error!(
156                target_closure = %expected,
157                actual = %observed,
158                previous = %previous_basename,
159                "agent: post-switch verify caught flip to unexpected closure - rolling back",
160            );
161            Ok(ActivationOutcome::VerifyMismatch {
162                expected: expected.clone(),
163                actual: observed,
164            })
165        }
166    }
167}