stygian_browser/
metrics.rs

1//! Performance metrics for stygian-browser.
2//!
3//! Tracks browser pool utilisation, acquisition latency, crash rates, and
4//! process memory.  Metrics are exported in **Prometheus text format** via
5//! [`gather`].
6//!
7//! ## Enabling
8//!
9//! ```toml
10//! [dependencies]
11//! stygian-browser = { version = "0.1", features = ["metrics"] }
12//! ```
13//!
14//! ## Example
15//!
16//! ```no_run
17//! use stygian_browser::metrics::{gather, METRICS};
18//!
19//! // After your scraping loop:
20//! let report = gather();
21//! println!("{report}");
22//! ```
23//!
24//! ## Prometheus metrics
25//!
//! | Name | Type | Description |
//! | ------ | ------ | ------------- |
//! | `browser_pool_size` | Gauge | Number of active browser instances |
//! | `browser_acquisition_duration_seconds` | Histogram | Time to acquire a browser |
//! | `browser_crashes_total` | Counter | Cumulative browser crashes |
//! | `browser_acquisitions_total` | Counter | Cumulative browser acquisition calls |
//! | `process_rss_bytes` | Gauge | Process resident set size (Linux only; `0` elsewhere) |
32
33use std::sync::{LazyLock, Mutex};
34use std::time::Duration;
35
36use prometheus_client::{
37    encoding::text::encode,
38    metrics::{
39        counter::Counter,
40        gauge::Gauge,
41        histogram::{Histogram, exponential_buckets},
42    },
43    registry::Registry,
44};
45use tracing::{error, warn};
46
47// ─── Thresholds ──────────────────────────────────────────────────────────────
48
/// Performance budget, in seconds, for a single browser acquisition.
///
/// Acquisitions that take strictly longer than this trigger a `warn!` log.
const WARN_ACQUISITION_SECS: f64 = 1.0;
51
/// Alert threshold for the crash rate, expressed as the fraction
/// `crashes / acquisitions` (`0.1` == 10%).
///
/// A rate strictly above this value is reported with an `error!` log.
const ALERT_CRASH_RATE_THRESHOLD: f64 = 0.1;
54
55// ─── Metrics container ───────────────────────────────────────────────────────
56
57/// Global Prometheus metrics for the browser pool.
58///
59/// Obtain a reference via the [`METRICS`] static.
60pub struct BrowserMetrics {
61    /// Active browser instance count.
62    pub pool_size: Gauge,
63    /// Histogram of browser acquisition durations in seconds.
64    pub acquisition_duration_seconds: Histogram,
65    /// Total browser crashes (unexpected terminations or health-check failures).
66    pub crashes_total: Counter,
67    /// Process RSS in bytes (Linux only; 0 on other platforms).
68    pub process_rss_bytes: Gauge,
69    /// Total acquisitions — used to compute crash rate.
70    acquisitions_total: Counter,
71    /// Prometheus text registry (mutex-guarded for `encode`).
72    registry: Mutex<Registry>,
73}
74
75impl BrowserMetrics {
76    fn new() -> Self {
77        // Histogram buckets: 5 ms → ~20 s (12 exponential buckets, factor 2)
78        let acquisition_duration_seconds = Histogram::new(exponential_buckets(0.005, 2.0, 12));
79        let pool_size = Gauge::default();
80        let crashes_total: Counter = Counter::default();
81        let acquisitions_total: Counter = Counter::default();
82        let process_rss_bytes = Gauge::default();
83
84        let mut registry = Registry::default();
85        registry.register(
86            "browser_pool_size",
87            "Number of active browser instances currently in use",
88            pool_size.clone(),
89        );
90        registry.register(
91            "browser_acquisition_duration_seconds",
92            "Time taken to acquire a browser instance from the pool",
93            acquisition_duration_seconds.clone(),
94        );
95        registry.register(
96            "browser_crashes_total",
97            "Cumulative number of browser crashes or health-check failures",
98            crashes_total.clone(),
99        );
100        registry.register(
101            "browser_acquisitions_total",
102            "Cumulative number of browser acquisition calls",
103            acquisitions_total.clone(),
104        );
105        registry.register(
106            "process_rss_bytes",
107            "Resident set size of the current process in bytes",
108            process_rss_bytes.clone(),
109        );
110
111        Self {
112            pool_size,
113            acquisition_duration_seconds,
114            crashes_total,
115            acquisitions_total,
116            process_rss_bytes,
117            registry: Mutex::new(registry),
118        }
119    }
120
121    /// Record a browser acquisition that took `duration`.
122    ///
123    /// Emits a warning if `duration` exceeds the 1-second performance budget.
124    /// Logs an error if the crash rate exceeds 10%.
125    pub fn record_acquisition(&self, duration: Duration) {
126        let secs = duration.as_secs_f64();
127        self.acquisition_duration_seconds.observe(secs);
128        self.acquisitions_total.inc();
129
130        if secs > WARN_ACQUISITION_SECS {
131            warn!(
132                elapsed_ms = duration.as_millis(),
133                "Browser acquisition exceeded 1s performance budget"
134            );
135        }
136
137        // Crash rate check
138        let crashes = self.crashes_total.get();
139        let acquires = self.acquisitions_total.get();
140        if acquires > 0 {
141            // Cap to u32 range so f64::from() is lossless (counts this large are never realistic)
142            let cap = u64::from(u32::MAX);
143            let rate = f64::from(u32::try_from(crashes.min(cap)).unwrap_or(u32::MAX))
144                / f64::from(u32::try_from(acquires.min(cap)).unwrap_or(u32::MAX));
145            if rate > ALERT_CRASH_RATE_THRESHOLD {
146                error!(
147                    crash_rate = format!("{:.1}%", rate * 100.0),
148                    crashes, acquires, "Browser crash rate exceeds 10% alert threshold"
149                );
150            }
151        }
152    }
153
154    /// Update the pool size gauge.
155    pub fn set_pool_size(&self, active: i64) {
156        self.pool_size.set(active);
157    }
158
159    /// Record a browser crash or health-check failure.
160    pub fn record_crash(&self) {
161        self.crashes_total.inc();
162    }
163
164    /// Refresh the process RSS gauge and return the current value in bytes.
165    ///
166    /// Returns `0` on platforms where `/proc/self/status` is unavailable.
167    pub fn refresh_rss(&self) -> i64 {
168        let rss = rss_bytes();
169        self.process_rss_bytes.set(rss);
170        rss
171    }
172
173    /// Encode all metrics as Prometheus text exposition format.
174    ///
175    /// # Errors
176    ///
177    /// Returns an empty string if encoding fails (registry mutex poisoned).
178    pub fn gather(&self) -> String {
179        self.refresh_rss();
180        let guard = match self.registry.lock() {
181            Ok(g) => g,
182            Err(e) => {
183                warn!("Metrics registry lock poisoned: {e}");
184                return String::new();
185            }
186        };
187        let mut buf = String::new();
188        if let Err(e) = encode(&mut buf, &guard) {
189            warn!("Failed to encode Prometheus metrics: {e}");
190        }
191        buf
192    }
193}
194
195// ─── Global singleton ────────────────────────────────────────────────────────
196
197/// Global metrics instance.
198///
199/// Use this to record acquisitions, crashes, or to produce the Prometheus
200/// text output via [`BrowserMetrics::gather`].
201pub static METRICS: LazyLock<BrowserMetrics> = LazyLock::new(BrowserMetrics::new);
202
203/// Convenience alias for [`METRICS.gather()`](BrowserMetrics::gather).
204///
205/// # Example
206///
207/// ```
208/// use stygian_browser::metrics::gather;
209/// let text = gather();
210/// assert!(text.contains("browser_pool_size"));
211/// ```
212pub fn gather() -> String {
213    METRICS.gather()
214}
215
216// ─── Platform-specific RSS ───────────────────────────────────────────────────
217
218/// Read process RSS from `/proc/self/status` (Linux) or return 0.
219// Not `const fn`: the Linux branch reads `/proc/self/status` (file I/O).
220// On other platforms clippy would suggest `const fn` because the body is just
221// `0`, but that would break cross-platform compilation.
222#[allow(clippy::missing_const_for_fn)]
223fn rss_bytes() -> i64 {
224    #[cfg(target_os = "linux")]
225    {
226        read_linux_rss().unwrap_or(0)
227    }
228    #[cfg(not(target_os = "linux"))]
229    {
230        0
231    }
232}
233
/// Read the resident set size of the current process from
/// `/proc/self/status`.
///
/// The first line starting with `VmRSS:` is located, its first
/// whitespace-separated field is parsed as an integer and multiplied by 1024
/// to obtain bytes.
///
/// Returns `None` when the file cannot be read, no `VmRSS:` line exists, the
/// field is missing or not a valid integer, or the multiplication would
/// overflow `i64`.
#[cfg(target_os = "linux")]
fn read_linux_rss() -> Option<i64> {
    let status = std::fs::read_to_string("/proc/self/status").ok()?;
    let field = status
        .lines()
        .find_map(|line| line.strip_prefix("VmRSS:"))?
        .split_whitespace()
        .next()?;
    let kb: i64 = field.parse().ok()?;
    // `checked_mul` reports an out-of-range value as `None` instead of
    // panicking (debug) or silently wrapping (release).
    kb.checked_mul(1024)
}
248
249// ─── Tests ───────────────────────────────────────────────────────────────────
250
#[cfg(test)]
mod tests {
    use super::*;
    use std::time::Duration;

    /// The pool-size gauge reports whatever value was last set.
    #[test]
    fn pool_size_gauge_tracks_value() {
        let metrics = BrowserMetrics::new();
        for expected in [3, 0] {
            metrics.set_pool_size(expected);
            assert_eq!(metrics.pool_size.get(), expected);
        }
    }

    /// Every recorded crash bumps the crash counter by one.
    #[test]
    fn crash_counter_increments() {
        let metrics = BrowserMetrics::new();
        for _ in 0..2 {
            metrics.record_crash();
        }
        assert_eq!(metrics.crashes_total.get(), 2);
    }

    /// Every recorded acquisition bumps the acquisition counter by one.
    #[test]
    fn acquisition_duration_observes() {
        let metrics = BrowserMetrics::new();
        for millis in [100, 500] {
            metrics.record_acquisition(Duration::from_millis(millis));
        }
        // Two acquisitions were recorded above.
        assert_eq!(metrics.acquisitions_total.get(), 2);
    }

    /// The encoded output mentions each of the core metric names.
    #[test]
    fn gather_contains_metric_names() {
        let metrics = BrowserMetrics::new();
        metrics.set_pool_size(2);
        metrics.record_crash();
        let text = metrics.gather();

        let expectations = [
            ("browser_pool_size", "missing pool_size"),
            ("browser_crashes_total", "missing crashes_total"),
            (
                "browser_acquisition_duration_seconds",
                "missing acquisition histogram",
            ),
        ];
        for (metric_name, failure) in expectations {
            assert!(text.contains(metric_name), "{failure}");
        }
    }

    /// The module-level `gather` encodes the global metrics instance.
    #[test]
    fn global_gather_has_expected_keys() {
        let text = gather();
        assert!(text.contains("browser_pool_size"));
    }

    /// RSS must never be reported as negative, whatever the platform.
    #[test]
    fn rss_is_non_negative() {
        assert!(!rss_bytes().is_negative());
    }
}