stygian_browser/
metrics.rs1use std::sync::{LazyLock, Mutex};
34use std::time::Duration;
35
36use prometheus_client::{
37 encoding::text::encode,
38 metrics::{
39 counter::Counter,
40 gauge::Gauge,
41 histogram::{Histogram, exponential_buckets},
42 },
43 registry::Registry,
44};
45use tracing::{error, warn};
46
/// Acquisition latency, in seconds, above which `BrowserMetrics::record_acquisition`
/// logs a warning.
const WARN_ACQUISITION_SECS: f64 = 1.0;

/// Crash-to-acquisition ratio (0.10 = 10%) above which
/// `BrowserMetrics::record_acquisition` logs an error.
const ALERT_CRASH_RATE_THRESHOLD: f64 = 0.10;
54
/// Prometheus metric handles for the browser pool, bundled with the registry
/// that the handles are registered in.
pub struct BrowserMetrics {
    /// Number of active browser instances currently in use.
    pub pool_size: Gauge,
    /// Time taken to acquire a browser instance from the pool, in seconds.
    pub acquisition_duration_seconds: Histogram,
    /// Cumulative number of browser crashes or health-check failures.
    pub crashes_total: Counter,
    /// Resident set size of the current process in bytes.
    pub process_rss_bytes: Gauge,
    /// Cumulative number of browser acquisition calls. Kept private; it is
    /// incremented by `record_acquisition` and used there as the denominator
    /// of the crash-rate check.
    acquisitions_total: Counter,
    /// Registry holding a clone of every metric above; locked by `gather`
    /// while the metrics are encoded.
    registry: Mutex<Registry>,
}
74
75impl BrowserMetrics {
76 fn new() -> Self {
77 let acquisition_duration_seconds = Histogram::new(exponential_buckets(0.005, 2.0, 12));
79 let pool_size = Gauge::default();
80 let crashes_total: Counter = Counter::default();
81 let acquisitions_total: Counter = Counter::default();
82 let process_rss_bytes = Gauge::default();
83
84 let mut registry = Registry::default();
85 registry.register(
86 "browser_pool_size",
87 "Number of active browser instances currently in use",
88 pool_size.clone(),
89 );
90 registry.register(
91 "browser_acquisition_duration_seconds",
92 "Time taken to acquire a browser instance from the pool",
93 acquisition_duration_seconds.clone(),
94 );
95 registry.register(
96 "browser_crashes_total",
97 "Cumulative number of browser crashes or health-check failures",
98 crashes_total.clone(),
99 );
100 registry.register(
101 "browser_acquisitions_total",
102 "Cumulative number of browser acquisition calls",
103 acquisitions_total.clone(),
104 );
105 registry.register(
106 "process_rss_bytes",
107 "Resident set size of the current process in bytes",
108 process_rss_bytes.clone(),
109 );
110
111 Self {
112 pool_size,
113 acquisition_duration_seconds,
114 crashes_total,
115 acquisitions_total,
116 process_rss_bytes,
117 registry: Mutex::new(registry),
118 }
119 }
120
121 pub fn record_acquisition(&self, duration: Duration) {
126 let secs = duration.as_secs_f64();
127 self.acquisition_duration_seconds.observe(secs);
128 self.acquisitions_total.inc();
129
130 if secs > WARN_ACQUISITION_SECS {
131 warn!(
132 elapsed_ms = duration.as_millis(),
133 "Browser acquisition exceeded 1s performance budget"
134 );
135 }
136
137 let crashes = self.crashes_total.get();
139 let acquires = self.acquisitions_total.get();
140 if acquires > 0 {
141 let cap = u64::from(u32::MAX);
143 let rate = f64::from(u32::try_from(crashes.min(cap)).unwrap_or(u32::MAX))
144 / f64::from(u32::try_from(acquires.min(cap)).unwrap_or(u32::MAX));
145 if rate > ALERT_CRASH_RATE_THRESHOLD {
146 error!(
147 crash_rate = format!("{:.1}%", rate * 100.0),
148 crashes, acquires, "Browser crash rate exceeds 10% alert threshold"
149 );
150 }
151 }
152 }
153
154 pub fn set_pool_size(&self, active: i64) {
156 self.pool_size.set(active);
157 }
158
159 pub fn record_crash(&self) {
161 self.crashes_total.inc();
162 }
163
164 pub fn refresh_rss(&self) -> i64 {
168 let rss = rss_bytes();
169 self.process_rss_bytes.set(rss);
170 rss
171 }
172
173 pub fn gather(&self) -> String {
179 self.refresh_rss();
180 let guard = match self.registry.lock() {
181 Ok(g) => g,
182 Err(e) => {
183 warn!("Metrics registry lock poisoned: {e}");
184 return String::new();
185 }
186 };
187 let mut buf = String::new();
188 if let Err(e) = encode(&mut buf, &guard) {
189 warn!("Failed to encode Prometheus metrics: {e}");
190 }
191 buf
192 }
193}
194
/// Process-wide [`BrowserMetrics`] instance, constructed lazily on first access.
pub static METRICS: LazyLock<BrowserMetrics> = LazyLock::new(BrowserMetrics::new);
202
203pub fn gather() -> String {
213 METRICS.gather()
214}
215
216#[allow(clippy::missing_const_for_fn)]
223fn rss_bytes() -> i64 {
224 #[cfg(target_os = "linux")]
225 {
226 read_linux_rss().unwrap_or(0)
227 }
228 #[cfg(not(target_os = "linux"))]
229 {
230 0
231 }
232}
233
/// Reads the `VmRSS` entry of `/proc/self/status` and converts it to bytes.
///
/// The first whitespace-separated token after the `VmRSS:` prefix is parsed
/// as an integer and multiplied by 1024.
///
/// Returns `None` when the file cannot be read, when no line starts with
/// `VmRSS:`, or when the first such line has no parseable number.
#[cfg(target_os = "linux")]
fn read_linux_rss() -> Option<i64> {
    let contents = std::fs::read_to_string("/proc/self/status").ok()?;
    // Only the first `VmRSS:` line is considered; a parse failure on it is final.
    let field = contents
        .lines()
        .find_map(|entry| entry.strip_prefix("VmRSS:"))?;
    let kilobytes = field.split_whitespace().next()?.parse::<i64>().ok()?;
    Some(kilobytes * 1024)
}
248
#[cfg(test)]
mod tests {
    use super::*;
    use std::time::Duration;

    /// The gauge reports whatever value was last set.
    #[test]
    fn pool_size_gauge_tracks_value() {
        let metrics = BrowserMetrics::new();
        for expected in [3_i64, 0] {
            metrics.set_pool_size(expected);
            assert_eq!(metrics.pool_size.get(), expected);
        }
    }

    /// Each `record_crash` call adds exactly one to the counter.
    #[test]
    fn crash_counter_increments() {
        let metrics = BrowserMetrics::new();
        for _ in 0..2 {
            metrics.record_crash();
        }
        assert_eq!(metrics.crashes_total.get(), 2);
    }

    /// Each recorded acquisition is counted once.
    #[test]
    fn acquisition_duration_observes() {
        let metrics = BrowserMetrics::new();
        for millis in [100_u64, 500] {
            metrics.record_acquisition(Duration::from_millis(millis));
        }
        assert_eq!(metrics.acquisitions_total.get(), 2);
    }

    /// The encoded output of a fresh instance mentions the core metric names.
    #[test]
    fn gather_contains_metric_names() {
        let metrics = BrowserMetrics::new();
        metrics.set_pool_size(2);
        metrics.record_crash();
        let encoded = metrics.gather();

        let expectations = [
            ("browser_pool_size", "missing pool_size"),
            ("browser_crashes_total", "missing crashes_total"),
            (
                "browser_acquisition_duration_seconds",
                "missing acquisition histogram",
            ),
        ];
        for (needle, complaint) in expectations {
            assert!(encoded.contains(needle), "{complaint}");
        }
    }

    /// The module-level `gather` goes through the global instance.
    #[test]
    fn global_gather_has_expected_keys() {
        let encoded = gather();
        assert!(encoded.contains("browser_pool_size"));
    }

    /// RSS sampling never yields a negative value.
    #[test]
    fn rss_is_non_negative() {
        let sampled = rss_bytes();
        assert!(sampled >= 0);
    }
}