nixfleet_control_plane/metrics.rs
1//! Prometheus counters surface - minimum viable set for alerting.
2//!
3//! Three counters (auto-emit on event, monotonic) + one info gauge for
4//! `cp_build_info`. No state-as-label gauges, no master/detail panel
5//! drivers - those proved unworkable in Grafana's table model and were
6//! stripped (see fleet's `nixfleet-events.json` which now reads from
7//! Loki instead). What stays is the alerting surface:
8//!
9//! - `nixfleet_compliance_failure_events_total{control_id, host}` -
10//! per-control, per-host. Cardinality bounded by the closed
11//! compliance set (~16 controls) × hosts.
12//! - `nixfleet_runtime_gate_error_events_total` - unlabeled. One
13//! global counter for the "agent couldn't measure compliance"
14//! class.
15//! - `nixfleet_gate_block_total{gate}` - one increment per
16//! `gates::evaluate_for_host` block. `gate` discriminator is one
17//! of the kebab-case gate kinds (channel-edges / wave-promotion /
18//! host-edge / disruption-budget / compliance-wave). Drives
19//! `rate(...{gate="compliance-wave"}[5m]) > 0` style alerts.
20//! - `nixfleet_cp_build_info{version, git_commit}=1` - one series.
21//! Standard pattern (cf. `kube_pod_info`) for tracking the
22//! deployed CP version across scrapes. Re-emitted every render
23//! since the values are compile-time constants.
24//!
//! When the `metrics` feature is disabled, every `record_*` function in
//! this module is a no-op, `install_recorder` is not compiled at all, and
//! neither dep (`metrics`, `metrics_exporter_prometheus`) is compiled in.
27//!
28//! The exporter recorder is process-global and idempotent - first
29//! `install_recorder()` wins. Tests can spin multiple test servers
30//! without colliding.
31//!
//! `idle_timeout` is deliberately NOT set: counters are cumulative and
//! must NEVER reset. The previous version applied idle eviction to its
//! state gauges, but none of those remain in this slim surface, so idle
//! eviction is moot. `cp_build_info` is the only gauge left, and it is
//! re-emitted on every scrape via `record_build_info()`.
37
38#[cfg(feature = "metrics")]
39use std::sync::OnceLock;
40
41#[cfg(feature = "metrics")]
42use metrics::{counter, gauge};
43#[cfg(feature = "metrics")]
44use metrics_exporter_prometheus::{PrometheusBuilder, PrometheusHandle};
45
/// Slot holding the handle of the process-global Prometheus recorder.
///
/// Empty until the first [`install_recorder`] call fills it.
#[cfg(feature = "metrics")]
static METRICS_HANDLE: OnceLock<PrometheusHandle> = OnceLock::new();

/// Installs the process-global Prometheus recorder and returns its handle.
///
/// Only the first call builds and installs a recorder; every later call
/// returns the handle cached by that first call, which makes this safe to
/// invoke from each test's server-spawn helper.
///
/// # Panics
///
/// Panics if `PrometheusBuilder::install_recorder` reports an error.
#[cfg(feature = "metrics")]
pub fn install_recorder() -> &'static PrometheusHandle {
    /// Builds the recorder with default settings and installs it globally.
    fn build_handle() -> PrometheusHandle {
        let installed = PrometheusBuilder::new().install_recorder();
        installed.expect("install Prometheus recorder")
    }

    METRICS_HANDLE.get_or_init(build_handle)
}
59
/// Counts one `ComplianceFailure` event, as reported through
/// `/v1/agent/report`, for the given control on the given host.
///
/// Adds 1 to `nixfleet_compliance_failure_events_total{control_id, host}`.
/// Label cardinality is bounded by hosts × controls.
#[cfg(feature = "metrics")]
pub fn record_compliance_event(control_id: &str, host: &str) {
    // The label values are not `'static`, so each one is copied into an
    // owned `String` before being handed to the macro.
    let control_label = control_id.to_owned();
    let host_label = host.to_owned();

    let failures = counter!(
        "nixfleet_compliance_failure_events_total",
        "control_id" => control_label,
        "host" => host_label,
    );
    failures.increment(1);
}

/// No-op stand-in used when the `metrics` feature is disabled.
#[cfg(not(feature = "metrics"))]
pub fn record_compliance_event(_control_id: &str, _host: &str) {}
74
/// Counts one `RuntimeGateError` event, as reported through
/// `/v1/agent/report`.
///
/// Adds 1 to the unlabeled `nixfleet_runtime_gate_error_events_total`
/// counter.
#[cfg(feature = "metrics")]
pub fn record_runtime_gate_error() {
    let gate_errors = counter!("nixfleet_runtime_gate_error_events_total");
    gate_errors.increment(1);
}

/// No-op stand-in used when the `metrics` feature is disabled.
#[cfg(not(feature = "metrics"))]
pub fn record_runtime_gate_error() {}
83
/// Counts one gate block, i.e. one `Some(GateBlock)` returned by
/// `gates::evaluate_for_host` at the dispatch endpoint.
///
/// Adds 1 to `nixfleet_gate_block_total{gate}`. `gate_kind` is the
/// kebab-case discriminator (channel-edges / wave-promotion / host-edge /
/// disruption-budget / compliance-wave) and is used verbatim as the `gate`
/// label value.
#[cfg(feature = "metrics")]
pub fn record_gate_block(gate_kind: &str) {
    // Not `'static`, so the label value has to be an owned copy.
    let gate_label = gate_kind.to_owned();

    let blocks = counter!(
        "nixfleet_gate_block_total",
        "gate" => gate_label,
    );
    blocks.increment(1);
}

/// No-op stand-in used when the `metrics` feature is disabled.
#[cfg(not(feature = "metrics"))]
pub fn record_gate_block(_gate_kind: &str) {}
99
/// Publishes `nixfleet_cp_build_info{version, git_commit} = 1`, the series
/// that identifies the deployed CP build.
///
/// `version` is `CARGO_PKG_VERSION` and `git_commit` is the `GIT_COMMIT`
/// environment variable, both captured at compile time; `git_commit` falls
/// back to `"unknown"` when `GIT_COMMIT` was not set during the build.
/// Meant to be called on every scrape so the series always renders.
#[cfg(feature = "metrics")]
pub fn record_build_info() {
    // Both label values are `&'static str` fixed at compile time, so they
    // are passed straight to the macro instead of being copied into a
    // fresh `String` on every call.
    const VERSION: &str = env!("CARGO_PKG_VERSION");
    let git_commit: &'static str = option_env!("GIT_COMMIT").unwrap_or("unknown");

    gauge!(
        "nixfleet_cp_build_info",
        "version" => VERSION,
        "git_commit" => git_commit,
    )
    .set(1.0);
}

/// No-op stand-in used when the `metrics` feature is disabled.
#[cfg(not(feature = "metrics"))]
pub fn record_build_info() {}
115
#[cfg(all(test, feature = "metrics"))]
mod tests {
    use super::install_recorder;

    /// Two consecutive installs must hand back the very same handle.
    #[test]
    fn install_recorder_is_idempotent() {
        let first = install_recorder();
        let second = install_recorder();

        // Compare addresses, not contents: both references must point at
        // the single handle stored in the process-wide slot.
        let same_handle = std::ptr::eq(first, second);
        assert!(same_handle, "recorder must be process-global");
    }
}