nixfleet_agent/runtime/workers/
cert_renewal.rs

1//! mTLS cert renewal worker. Reads the cert at `cfg.client_cert` on a
2//! timer, computes the remaining-validity fraction, and POSTs
3//! `/v1/agent/renew` when below `cfg.renewal_threshold_fraction`. The
4//! CSR is signed by the host's SSH ed25519 key (same key as
5//! enrollment per RFC-0003 §2), so the renewed cert's pubkey is
6//! unchanged.
7//!
8//! The renewed cert lands at the same on-disk path atomically via
9//! `enrollment::renew`. Running workers (heartbeat, longpoll,
10//! manifest_poll) keep their existing in-memory mTLS clients — those
11//! clients still hold the OLD cert bytes. Production: the agent's
12//! systemd unit has `Restart=always RestartSec=30`, so any restart
13//! event (next operator-driven nixos-rebuild, the cert-near-expiry
14//! handover, or any crash) loads the freshly-written cert into the
15//! workers' clients. The renewal worker exists to keep the on-disk
16//! cert from ever expiring; it does NOT hot-swap the running clients'
17//! identities.
18//!
19//! Disabled when `cfg.renewal_threshold_fraction.is_none()` — the
20//! worker logs a single info line and exits. Use this in tests where
21//! the cert is fixture-baked and renewal would be noise.
22
23use std::time::Duration;
24
25use nixfleet_proto::clock::ClockHandle;
26use tokio::task::JoinHandle;
27
28use super::super::{AgentConfig, ShutdownToken};
29
/// Cadence at which the worker re-reads the cert + checks the
/// threshold. 60s matches the heartbeat ticker; the renewal decision
/// is cheap (parse PEM, compute fraction) so amortised cost is
/// trivial. Independent of the renewal HTTP cost because that only
/// fires when the threshold trips.
const CHECK_INTERVAL: Duration = Duration::from_secs(60);

/// Pause taken after a failed renewal check before the worker goes
/// back to waiting on the `CHECK_INTERVAL` ticker.
const ERROR_BACKOFF: Duration = Duration::from_secs(30);
38
39pub fn spawn(
40    cfg: AgentConfig,
41    clock: ClockHandle,
42    shutdown: ShutdownToken,
43) -> JoinHandle<()> {
44    tokio::spawn(async move {
45        let mut shutdown_rx = shutdown.into_inner();
46        let Some(threshold) = cfg.renewal_threshold_fraction else {
47            tracing::info!(
48                target: "agent_cert_renewal",
49                "renewal_threshold_fraction unset — cert renewal worker disabled",
50            );
51            // Stay alive on the shutdown channel so the runtime's
52            // shutdown handshake doesn't see a missing receiver.
53            let _ = shutdown_rx.await;
54            return;
55        };
56        if !(0.0 < threshold && threshold < 1.0) {
57            tracing::error!(
58                target: "agent_cert_renewal",
59                threshold,
60                "renewal_threshold_fraction must be strictly between 0 and 1 — worker exiting",
61            );
62            let _ = shutdown_rx.await;
63            return;
64        }
65        let Some(cert_path) = cfg.client_cert.clone() else {
66            tracing::info!(
67                target: "agent_cert_renewal",
68                "client_cert unset — renewal worker has nothing to renew; exiting",
69            );
70            let _ = shutdown_rx.await;
71            return;
72        };
73
74        let client = match crate::comms::build_client(
75            cfg.ca_cert.as_deref(),
76            cfg.client_cert.as_deref(),
77            cfg.client_key.as_deref(),
78        ) {
79            Ok(c) => c,
80            Err(err) => {
81                tracing::error!(
82                    target: "agent_cert_renewal",
83                    error = %err,
84                    "failed to build mTLS HTTP client; worker exits",
85                );
86                let _ = shutdown_rx.await;
87                return;
88            }
89        };
90
91        let mut ticker = tokio::time::interval(CHECK_INTERVAL);
92        ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip);
93
94        loop {
95            tokio::select! {
96                biased;
97                _ = &mut shutdown_rx => {
98                    tracing::info!(
99                        target: "shutdown",
100                        task = "agent_cert_renewal",
101                        "task shut down",
102                    );
103                    return;
104                }
105                _ = ticker.tick() => {
106                    if let Err(err) = maybe_renew_once(
107                        &client,
108                        &cfg,
109                        &clock,
110                        &cert_path,
111                        threshold,
112                    ).await {
113                        tracing::warn!(
114                            target: "agent_cert_renewal",
115                            error = %err,
116                            "renewal check failed; backing off",
117                        );
118                        tokio::time::sleep(ERROR_BACKOFF).await;
119                    }
120                }
121            }
122        }
123    })
124}
125
126async fn maybe_renew_once(
127    client: &reqwest::Client,
128    cfg: &AgentConfig,
129    clock: &ClockHandle,
130    cert_path: &std::path::Path,
131    threshold: f64,
132) -> anyhow::Result<()> {
133    let (remaining, not_after) =
134        crate::enrollment::cert_remaining_fraction(cert_path, clock.now())?;
135    if remaining >= threshold {
136        return Ok(());
137    }
138    tracing::info!(
139        target: "agent_cert_renewal",
140        remaining,
141        threshold,
142        not_after = %not_after.to_rfc3339(),
143        "cert remaining fraction below threshold; renewing",
144    );
145    crate::enrollment::renew(
146        client,
147        &cfg.control_plane_url,
148        &cfg.machine_id,
149        cert_path,
150        &cfg.ssh_host_key_file,
151    )
152    .await?;
153    tracing::info!(
154        target: "agent_cert_renewal",
155        cert = %cert_path.display(),
156        "cert renewed — file rewritten; running workers continue with prior in-memory client \
157         until next agent restart picks up the new cert",
158    );
159    Ok(())
160}