nixfleet_control_plane/
metrics.rs

1//! Prometheus counters surface - minimum viable set for alerting.
2//!
3//! Three counters (auto-emit on event, monotonic) + one info gauge for
4//! `cp_build_info`. No state-as-label gauges, no master/detail panel
5//! drivers - those proved unworkable in Grafana's table model and were
6//! stripped (see fleet's `nixfleet-events.json` which now reads from
7//! Loki instead). What stays is the alerting surface:
8//!
9//!   - `nixfleet_compliance_failure_events_total{control_id, host}`  -
10//!     per-control, per-host. Cardinality bounded by the closed
11//!     compliance set (~16 controls) × hosts.
12//!   - `nixfleet_runtime_gate_error_events_total` - unlabeled. One
13//!     global counter for the "agent couldn't measure compliance"
14//!     class.
15//!   - `nixfleet_gate_block_total{gate}` - one increment per
16//!     `gates::evaluate_for_host` block. `gate` discriminator is one
17//!     of the kebab-case gate kinds (channel-edges / wave-promotion /
18//!     host-edge / disruption-budget / compliance-wave). Drives
19//!     `rate(...{gate="compliance-wave"}[5m]) > 0` style alerts.
20//!   - `nixfleet_cp_build_info{version, git_commit}=1` - one series.
21//!     Standard pattern (cf. `kube_pod_info`) for tracking the
22//!     deployed CP version across scrapes. Re-emitted every render
23//!     since the values are compile-time constants.
24//!
//! When the `metrics` feature is disabled, the `record_*` functions in
//! this module are no-ops, `install_recorder` is not compiled at all,
//! and neither dep is compiled in.
27//!
28//! The exporter recorder is process-global and idempotent - first
29//! `install_recorder()` wins. Tests can spin multiple test servers
30//! without colliding.
31//!
//! `idle_timeout` is deliberately NOT set: counters are cumulative and
//! must NEVER reset. The previous version applied idle eviction to
//! gauges, but with no state-tracking gauges left in this slim surface,
//! idle eviction is moot. `nixfleet_cp_build_info` is the only gauge,
//! and it is re-emitted on every scrape via `record_build_info()`.
37
38#[cfg(feature = "metrics")]
39use std::sync::OnceLock;
40
41#[cfg(feature = "metrics")]
42use metrics::{counter, gauge};
43#[cfg(feature = "metrics")]
44use metrics_exporter_prometheus::{PrometheusBuilder, PrometheusHandle};
45
/// Process-wide slot holding the handle of the installed Prometheus
/// recorder. Filled at most once, by the first `install_recorder()` call.
#[cfg(feature = "metrics")]
static METRICS_HANDLE: OnceLock<PrometheusHandle> = OnceLock::new();

/// Return the handle of the process-global Prometheus recorder,
/// installing the recorder on the first call.
///
/// Every later call returns the same `&'static` handle without building
/// anything again, so test helpers may call this once per spawned server.
///
/// # Panics
///
/// Panics with the message `install Prometheus recorder` if the builder
/// fails to install the recorder on the first call.
#[cfg(feature = "metrics")]
pub fn install_recorder() -> &'static PrometheusHandle {
    /// Build a default-configured recorder and register it globally.
    fn build_and_install() -> PrometheusHandle {
        let installed = PrometheusBuilder::new().install_recorder();
        installed.expect("install Prometheus recorder")
    }

    METRICS_HANDLE.get_or_init(build_and_install)
}
59
/// Count one `ComplianceFailure` event that arrived via `/v1/agent/report`.
///
/// Adds 1 to `nixfleet_compliance_failure_events_total`, labelled with
/// `control_id` and `host`; both arguments are copied into owned label
/// values. Label space is bounded by hosts × controls.
#[cfg(feature = "metrics")]
pub fn record_compliance_event(control_id: &str, host: &str) {
    let control_label = String::from(control_id);
    let host_label = String::from(host);
    let failures = counter!(
        "nixfleet_compliance_failure_events_total",
        "control_id" => control_label,
        "host" => host_label,
    );
    failures.increment(1);
}

/// Stand-in used when the `metrics` feature is off: takes the same
/// arguments as the real recorder and does nothing.
#[cfg(not(feature = "metrics"))]
pub fn record_compliance_event(_control_id: &str, _host: &str) {}
74
/// Count one `RuntimeGateError` event that arrived via `/v1/agent/report`.
///
/// Adds 1 to the unlabeled `nixfleet_runtime_gate_error_events_total`
/// counter.
#[cfg(feature = "metrics")]
pub fn record_runtime_gate_error() {
    let gate_errors = counter!("nixfleet_runtime_gate_error_events_total");
    gate_errors.increment(1);
}

/// Stand-in used when the `metrics` feature is off: does nothing.
#[cfg(not(feature = "metrics"))]
pub fn record_runtime_gate_error() {}
83
/// Count one gate block, i.e. one `Some(GateBlock)` returned by
/// `gates::evaluate_for_host` at the dispatch endpoint.
///
/// Adds 1 to `nixfleet_gate_block_total`, with `gate_kind` copied into
/// the `gate` label. `gate_kind` is the kebab-case discriminator
/// (channel-edges / wave-promotion / host-edge / disruption-budget /
/// compliance-wave).
#[cfg(feature = "metrics")]
pub fn record_gate_block(gate_kind: &str) {
    let gate_label = String::from(gate_kind);
    let blocks = counter!(
        "nixfleet_gate_block_total",
        "gate" => gate_label,
    );
    blocks.increment(1);
}

/// Stand-in used when the `metrics` feature is off: takes the same
/// argument as the real recorder and does nothing.
#[cfg(not(feature = "metrics"))]
pub fn record_gate_block(_gate_kind: &str) {}
99
/// Publish `nixfleet_cp_build_info{version, git_commit}` with value 1.
///
/// Both label values are fixed at compile time: `version` is
/// `CARGO_PKG_VERSION`, and `git_commit` is the `GIT_COMMIT` environment
/// variable as seen by the compiler, or `"unknown"` when it was not set.
/// Intended to be re-emitted on each scrape so the series always renders.
#[cfg(feature = "metrics")]
pub fn record_build_info() {
    // Both labels are `&'static str` constants, so they are passed to the
    // macro directly rather than being copied into a fresh `String` on
    // every call.
    const VERSION: &str = env!("CARGO_PKG_VERSION");
    const GIT_COMMIT: &str = match option_env!("GIT_COMMIT") {
        Some(commit) => commit,
        None => "unknown",
    };

    gauge!(
        "nixfleet_cp_build_info",
        "version" => VERSION,
        "git_commit" => GIT_COMMIT,
    )
    .set(1.0);
}

/// Stand-in used when the `metrics` feature is off: does nothing.
#[cfg(not(feature = "metrics"))]
pub fn record_build_info() {}
115
#[cfg(all(test, feature = "metrics"))]
mod tests {
    use super::install_recorder;

    /// Installing twice must hand back the very same `'static` handle,
    /// not a second recorder.
    #[test]
    fn install_recorder_is_idempotent() {
        let first = install_recorder();
        let second = install_recorder();
        let same_handle = std::ptr::eq(first, second);
        assert!(same_handle, "recorder must be process-global");
    }
}
126}