stygian_browser/
recorder.rs

1//! Browser session recording and debugging tools.
2//!
3//! Captures CDP events, network traffic, and performance metrics for
4//! debugging failed scraping runs, analysing anti-bot detection, and
5//! performance profiling.
6//!
7//! ## Configuration
8//!
9//! | Variable | Default | Description |
10//! | ---------- | --------- | ------------- |
11//! | `STYGIAN_RECORD_SESSION` | `false` | Enable recording automatically |
12//! | `STYGIAN_RECORD_DIR` | `./recordings` | Output directory |
13//!
14//! ## HAR export
15//!
16//! Records all network requests in the
17//! [HTTP Archive (HAR 1.2)](https://w3c.github.io/web-performance/specs/HAR/Overview.html)
18//! format, which can be opened in Chrome `DevTools`, Fiddler, or analysed
19//! programmatically.
20//!
21//! ## Example
22//!
23//! ```no_run
24//! use stygian_browser::{BrowserPool, BrowserConfig, WaitUntil};
25//! use stygian_browser::recorder::{SessionRecorder, RecorderConfig};
26//! use std::time::Duration;
27//!
28//! # async fn run() -> stygian_browser::error::Result<()> {
29//! let pool = BrowserPool::new(BrowserConfig::default()).await?;
30//! let handle = pool.acquire().await?;
31//! let mut page = handle.browser().expect("valid browser").new_page().await?;
32//!
//! let recorder = SessionRecorder::start(RecorderConfig::default());
34//! page.navigate("https://example.com", WaitUntil::Selector("body".to_string()), Duration::from_secs(30)).await?;
35//!
36//! // Log a CDP event manually
37//! recorder.record_event("Page.loadEventFired", serde_json::json!({"timestamp": 1234.5}));
38//!
39//! // Export HAR
40//! recorder.stop();
41//! recorder.export_har("session.har")?;
42//! # Ok(())
43//! # }
44//! ```
45
46use std::{
47    collections::HashMap,
48    path::Path,
49    sync::atomic::{AtomicBool, Ordering},
50    time::{Duration, Instant, SystemTime, UNIX_EPOCH},
51};
52
53use serde::{Deserialize, Serialize};
54use serde_json::Value;
55use tracing::debug;
56
57use crate::error::{BrowserError, Result};
58
59// ─── RecorderConfig ───────────────────────────────────────────────────────────
60
/// Configuration for a [`SessionRecorder`].
///
/// The [`Default`] implementation takes the output directory from the
/// `STYGIAN_RECORD_DIR` environment variable (falling back to
/// `./recordings`) and uses fixed buffer limits.
#[derive(Debug, Clone)]
pub struct RecorderConfig {
    /// Directory to write recording files to.
    // NOTE(review): nothing in this file reads `output_dir`; the export
    // methods take an explicit path. Confirm which caller consumes it.
    pub output_dir: std::path::PathBuf,
    /// Maximum number of CDP events to buffer (older events are dropped first).
    pub max_events: usize,
    /// Maximum number of network entries to buffer.
    pub max_network_entries: usize,
}
71
72impl Default for RecorderConfig {
73    fn default() -> Self {
74        let output_dir = std::env::var("STYGIAN_RECORD_DIR").map_or_else(
75            |_| std::path::PathBuf::from("./recordings"),
76            std::path::PathBuf::from,
77        );
78
79        Self {
80            output_dir,
81            max_events: 10_000,
82            max_network_entries: 5_000,
83        }
84    }
85}
86
87// ─── CDP event log ────────────────────────────────────────────────────────────
88
89/// A single recorded CDP event.
90#[derive(Debug, Clone, Serialize, Deserialize)]
91pub struct CdpEvent {
92    /// Monotonic offset from the recorder start in milliseconds.
93    pub elapsed_ms: u64,
94    /// CDP method name (e.g. `"Network.requestWillBeSent"`).
95    pub method: String,
96    /// Event payload as JSON.
97    pub params: Value,
98}
99
100// ─── HAR types ────────────────────────────────────────────────────────────────
101
102/// HAR 1.2 root object.
103#[derive(Debug, Serialize, Deserialize)]
104pub struct Har {
105    /// HAR root.
106    pub log: HarLog,
107}
108
109/// HAR log.
110#[derive(Debug, Serialize, Deserialize)]
111pub struct HarLog {
112    /// HAR version.
113    pub version: String,
114    /// Creator metadata.
115    pub creator: HarCreator,
116    /// List of HTTP transactions.
117    pub entries: Vec<HarEntry>,
118}
119
120/// HAR creator metadata.
121#[derive(Debug, Serialize, Deserialize)]
122pub struct HarCreator {
123    /// Name.
124    pub name: String,
125    /// Version.
126    pub version: String,
127}
128
129/// A single HAR network entry (request + response).
130#[derive(Debug, Clone, Serialize, Deserialize)]
131pub struct HarEntry {
132    /// ISO 8601 timestamp of the request start.
133    pub started_date_time: String,
134    /// Total elapsed time in milliseconds.
135    pub time: f64,
136    /// HTTP request.
137    pub request: HarRequest,
138    /// HTTP response.
139    pub response: HarResponse,
140    /// Additional timing details.
141    pub timings: HarTimings,
142}
143
144/// A HAR HTTP request.
145#[derive(Debug, Clone, Serialize, Deserialize)]
146pub struct HarRequest {
147    /// HTTP method.
148    pub method: String,
149    /// Full request URL.
150    pub url: String,
151    /// HTTP version (e.g. `"HTTP/1.1"`).
152    pub http_version: String,
153    /// Request headers.
154    pub headers: Vec<HarHeader>,
155    /// Query string parameters.
156    pub query_string: Vec<HarQueryParam>,
157    /// Total bytes transferred.
158    pub headers_size: i64,
159    /// POST body size (-1 = unknown).
160    pub body_size: i64,
161}
162
163/// A HAR HTTP response.
164#[derive(Debug, Clone, Serialize, Deserialize)]
165pub struct HarResponse {
166    /// HTTP status code.
167    pub status: u16,
168    /// Status text (e.g. `"OK"`).
169    pub status_text: String,
170    /// HTTP version.
171    pub http_version: String,
172    /// Response headers.
173    pub headers: Vec<HarHeader>,
174    /// MIME type of response body.
175    pub content_mime_type: String,
176    /// Response body size in bytes (-1 = unknown).
177    pub body_size: i64,
178}
179
180/// A single HTTP header.
181#[derive(Debug, Clone, Serialize, Deserialize)]
182pub struct HarHeader {
183    /// Header name.
184    pub name: String,
185    /// Header value.
186    pub value: String,
187}
188
189/// Query string parameter.
190#[derive(Debug, Clone, Serialize, Deserialize)]
191pub struct HarQueryParam {
192    /// Parameter name.
193    pub name: String,
194    /// Parameter value.
195    pub value: String,
196}
197
198/// HAR timing breakdown.
199#[derive(Debug, Clone, Serialize, Deserialize)]
200pub struct HarTimings {
201    /// Time to receive response (ms).
202    pub receive: f64,
203}
204
205// ─── Network entry (internal) ─────────────────────────────────────────────────
206
/// Internal representation of a recorded network transaction.
///
/// Created when `Network.requestWillBeSent` is seen, filled in by
/// `Network.responseReceived`, and finalised by `Network.loadingFinished`.
#[derive(Debug, Clone)]
struct NetworkEntry {
    /// Monotonic instant at which the recorder saw the request.
    started_at: Instant,
    /// Wall-clock start time, pre-formatted as an ISO 8601 string.
    started_iso: String,
    // Kept for debugging output; the value is written but never read back in
    // this file, hence the lint exemption.
    #[allow(dead_code)]
    request_id: String,
    /// HTTP method of the request.
    method: String,
    /// Full request URL.
    url: String,
    /// Request headers as reported by the browser.
    request_headers: Vec<HarHeader>,
    /// HTTP status code; `0` until a response has been received.
    status: u16,
    /// HTTP status text; empty until a response has been received.
    status_text: String,
    /// Response headers; empty until a response has been received.
    response_headers: Vec<HarHeader>,
    /// Response MIME type; empty until a response has been received.
    mime_type: String,
    /// Encoded size reported when loading finished; `-1` while unknown.
    encoded_data_length: i64,
}
223
224/// Returns the current time as an ISO 8601 string.
225fn iso_timestamp() -> String {
226    let d = SystemTime::now()
227        .duration_since(UNIX_EPOCH)
228        .unwrap_or(Duration::ZERO);
229    let secs = d.as_secs();
230    let millis = d.subsec_millis();
231    // Simple ISO 8601 (no chrono dep): YYYY-MM-DDTHH:MM:SS.mmmZ
232    let s = secs % 60;
233    let m = (secs / 60) % 60;
234    let h = (secs / 3600) % 24;
235    let days = secs / 86400;
236    // Epoch is 1970-01-01 — compute approximate date (no tz, no leap seconds)
237    let (year, month, day) = epoch_days_to_ymd(days);
238    format!("{year:04}-{month:02}-{day:02}T{h:02}:{m:02}:{s:02}.{millis:03}Z")
239}
240
/// Converts a count of days since 1970-01-01 into a `(year, month, day)` triple
/// in the proleptic Gregorian calendar.
///
/// The arithmetic works on a calendar whose year starts on 1 March, which puts
/// the leap day at the end of the year. Values that do not fit the return
/// types fall back to `9999`, `12` and `31` respectively.
fn epoch_days_to_ymd(days: u64) -> (u32, u32, u32) {
    // Length of one 400-year Gregorian cycle in days.
    const DAYS_PER_ERA: i64 = 146_097;

    // Shift the origin from 1970-01-01 to 0000-03-01.
    let shifted = i64::try_from(days).map_or(i64::MAX, |d| d.saturating_add(719_468));

    let era = shifted.div_euclid(DAYS_PER_ERA);
    let day_of_era = shifted.rem_euclid(DAYS_PER_ERA);
    let year_of_era =
        (day_of_era - day_of_era / 1460 + day_of_era / 36524 - day_of_era / 146_096) / 365;
    let day_of_year = day_of_era - (365 * year_of_era + year_of_era / 4 - year_of_era / 100);

    // Month index counted from March (0 = March, ..., 11 = February).
    let month_index = (5 * day_of_year + 2) / 153;
    let day = day_of_year - (153 * month_index + 2) / 5 + 1;

    // January and February belong to the following civil year.
    let (month, year_bump) = if month_index < 10 {
        (month_index + 3, 0)
    } else {
        (month_index - 9, 1)
    };
    let year = year_of_era + era * 400 + year_bump;

    (
        u32::try_from(year).unwrap_or(9999),
        u32::try_from(month).unwrap_or(12),
        u32::try_from(day).unwrap_or(31),
    )
}
262
263/// Parse query string `?k=v&k2=v2` into HAR params.
264fn parse_query(url: &str) -> Vec<HarQueryParam> {
265    let query = url.split_once('?').map_or("", |(_, q)| q);
266    query
267        .split('&')
268        .filter(|p| !p.is_empty())
269        .filter_map(|p| p.split_once('='))
270        .map(|(k, v)| HarQueryParam {
271            name: k.to_string(),
272            value: v.to_string(),
273        })
274        .collect()
275}
276
277// ─── SessionRecorder ──────────────────────────────────────────────────────────
278
279/// Records CDP events and network traffic during a browser session.
280///
281/// Create one per scraping job, call [`record_event`](Self::record_event) for
282/// each CDP event you want to log, then call [`stop`](Self::stop) and
283/// [`export_har`](Self::export_har) when the session ends.
284pub struct SessionRecorder {
285    config: RecorderConfig,
286    start: Instant,
287    running: AtomicBool,
288    events: std::sync::Mutex<Vec<CdpEvent>>,
289    /// Pending requests by requestId
290    pending: std::sync::Mutex<HashMap<String, NetworkEntry>>,
291    /// Completed network transactions
292    completed: std::sync::Mutex<Vec<NetworkEntry>>,
293}
294
295impl SessionRecorder {
296    /// Start a new recorder with the given `config`.
297    pub fn start(config: RecorderConfig) -> Self {
298        debug!("SessionRecorder started");
299        Self {
300            config,
301            start: Instant::now(),
302            running: AtomicBool::new(true),
303            events: std::sync::Mutex::new(Vec::new()),
304            pending: std::sync::Mutex::new(HashMap::new()),
305            completed: std::sync::Mutex::new(Vec::new()),
306        }
307    }
308
309    /// Returns `true` if the recorder is currently running.
310    pub fn is_running(&self) -> bool {
311        self.running.load(Ordering::Relaxed)
312    }
313
314    /// Stop the recorder. No more events will be buffered after this.
315    pub fn stop(&self) {
316        self.running.store(false, Ordering::Relaxed);
317        debug!("SessionRecorder stopped");
318    }
319
320    /// Record a raw CDP event by method name and parameters.
321    ///
322    /// Call this for every CDP event you receive from the browser.
323    /// The recorder automatically tracks `Network.requestWillBeSent` and
324    /// `Network.responseReceived` events to build HAR entries.
325    pub fn record_event(&self, method: &str, params: Value) {
326        if !self.is_running() {
327            return;
328        }
329
330        let elapsed_ms = u64::try_from(self.start.elapsed().as_millis()).unwrap_or(u64::MAX);
331
332        // Handle network events for HAR building
333        match method {
334            "Network.requestWillBeSent" => self.on_request_sent(&params, elapsed_ms),
335            "Network.responseReceived" => self.on_response_received(&params),
336            "Network.loadingFinished" => self.on_loading_finished(&params),
337            _ => {}
338        }
339
340        let Ok(mut guard) = self.events.lock() else {
341            return;
342        };
343
344        if guard.len() >= self.config.max_events {
345            guard.remove(0);
346        }
347        guard.push(CdpEvent {
348            elapsed_ms,
349            method: method.to_string(),
350            params,
351        });
352    }
353
354    /// Export the buffered CDP event log as a newline-delimited JSON file.
355    ///
356    /// # Errors
357    ///
358    /// Returns an IO error if the file cannot be written.
359    pub fn export_event_log(&self, path: impl AsRef<Path>) -> Result<()> {
360        let guard = self
361            .events
362            .lock()
363            .map_err(|_| BrowserError::ConfigError("event log lock poisoned".to_string()))?;
364
365        let mut lines: Vec<String> = Vec::with_capacity(guard.len());
366        for event in guard.iter() {
367            if let Ok(s) = serde_json::to_string(event) {
368                lines.push(s);
369            }
370        }
371        drop(guard);
372
373        std::fs::write(path, lines.join("\n")).map_err(BrowserError::Io)
374    }
375
376    /// Export captured network transactions as a HAR 1.2 file.
377    ///
378    /// # Errors
379    ///
380    /// Returns an IO or serialisation error if the file cannot be written.
381    pub fn export_har(&self, path: impl AsRef<Path>) -> Result<()> {
382        let har = self.build_har();
383        let json = serde_json::to_string_pretty(&har)
384            .map_err(|e| BrowserError::ConfigError(format!("Failed to serialise HAR: {e}")))?;
385        std::fs::create_dir_all(path.as_ref().parent().unwrap_or_else(|| Path::new(".")))
386            .map_err(BrowserError::Io)?;
387        std::fs::write(path, json).map_err(BrowserError::Io)
388    }
389
390    /// Return the number of buffered CDP events.
391    pub fn event_count(&self) -> usize {
392        self.events.lock().map(|g| g.len()).unwrap_or(0)
393    }
394
395    /// Return the number of completed network entries.
396    pub fn network_entry_count(&self) -> usize {
397        self.completed.lock().map(|g| g.len()).unwrap_or(0)
398    }
399
400    // ── Network event handlers ─────────────────────────────────────────────────
401
402    fn on_request_sent(&self, params: &Value, _elapsed_ms: u64) {
403        let request_id = params
404            .get("requestId")
405            .and_then(|v| v.as_str())
406            .unwrap_or("")
407            .to_string();
408        let method = params
409            .pointer("/request/method")
410            .and_then(|v| v.as_str())
411            .unwrap_or("GET")
412            .to_string();
413        let url = params
414            .pointer("/request/url")
415            .and_then(|v| v.as_str())
416            .unwrap_or("")
417            .to_string();
418
419        let request_headers: Vec<HarHeader> = params
420            .pointer("/request/headers")
421            .and_then(|v| v.as_object())
422            .into_iter()
423            .flat_map(|m| {
424                m.iter().map(|(k, v)| HarHeader {
425                    name: k.clone(),
426                    value: v.as_str().unwrap_or("").to_string(),
427                })
428            })
429            .collect();
430
431        let entry = NetworkEntry {
432            started_at: Instant::now(),
433            started_iso: iso_timestamp(),
434            request_id: request_id.clone(),
435            method,
436            url,
437            request_headers,
438            status: 0,
439            status_text: String::new(),
440            response_headers: vec![],
441            mime_type: String::new(),
442            encoded_data_length: -1,
443        };
444
445        if let Ok(mut guard) = self.pending.lock() {
446            guard.insert(request_id, entry);
447        }
448    }
449
450    fn on_response_received(&self, params: &Value) {
451        let request_id = params
452            .get("requestId")
453            .and_then(|v| v.as_str())
454            .unwrap_or("")
455            .to_string();
456
457        let status = u16::try_from(
458            params
459                .pointer("/response/status")
460                .and_then(serde_json::Value::as_u64)
461                .unwrap_or(0),
462        )
463        .unwrap_or(0);
464        let status_text = params
465            .pointer("/response/statusText")
466            .and_then(|v| v.as_str())
467            .unwrap_or("")
468            .to_string();
469        let mime_type = params
470            .pointer("/response/mimeType")
471            .and_then(|v| v.as_str())
472            .unwrap_or("")
473            .to_string();
474        let response_headers: Vec<HarHeader> = params
475            .pointer("/response/headers")
476            .and_then(|v| v.as_object())
477            .into_iter()
478            .flat_map(|m| {
479                m.iter().map(|(k, v)| HarHeader {
480                    name: k.clone(),
481                    value: v.as_str().unwrap_or("").to_string(),
482                })
483            })
484            .collect();
485
486        if let Ok(mut guard) = self.pending.lock()
487            && let Some(entry) = guard.get_mut(&request_id)
488        {
489            entry.status = status;
490            entry.status_text = status_text;
491            entry.mime_type = mime_type;
492            entry.response_headers = response_headers;
493        }
494    }
495
496    fn on_loading_finished(&self, params: &Value) {
497        let request_id = params
498            .get("requestId")
499            .and_then(|v| v.as_str())
500            .unwrap_or("")
501            .to_string();
502        let encoded_data_length = params
503            .get("encodedDataLength")
504            .and_then(serde_json::Value::as_i64)
505            .unwrap_or(-1);
506
507        let Ok(mut pending_guard) = self.pending.lock() else {
508            return;
509        };
510
511        if let Some(mut entry) = pending_guard.remove(&request_id) {
512            entry.encoded_data_length = encoded_data_length;
513            if let Ok(mut completed) = self.completed.lock() {
514                if completed.len() >= self.config.max_network_entries {
515                    completed.remove(0);
516                }
517                completed.push(entry);
518            }
519        }
520    }
521
522    fn build_har(&self) -> Har {
523        let completed = self.completed.lock().map(|g| g.clone()).unwrap_or_default();
524
525        let entries: Vec<HarEntry> = completed
526            .into_iter()
527            .map(|entry| {
528                let elapsed = entry.started_at.elapsed().as_secs_f64() * 1000.0;
529                let query_string = parse_query(&entry.url);
530                HarEntry {
531                    started_date_time: entry.started_iso.clone(),
532                    time: elapsed,
533                    request: HarRequest {
534                        method: entry.method,
535                        url: entry.url,
536                        http_version: "HTTP/1.1".to_string(),
537                        headers: entry.request_headers,
538                        query_string,
539                        headers_size: -1,
540                        body_size: -1,
541                    },
542                    response: HarResponse {
543                        status: entry.status,
544                        status_text: entry.status_text,
545                        http_version: "HTTP/1.1".to_string(),
546                        headers: entry.response_headers,
547                        content_mime_type: entry.mime_type,
548                        body_size: entry.encoded_data_length,
549                    },
550                    timings: HarTimings { receive: elapsed },
551                }
552            })
553            .collect();
554
555        Har {
556            log: HarLog {
557                version: "1.2".to_string(),
558                creator: HarCreator {
559                    name: "stygian-browser".to_string(),
560                    version: env!("CARGO_PKG_VERSION").to_string(),
561                },
562                entries,
563            },
564        }
565    }
566}
567
568// ─── Convenience helpers ──────────────────────────────────────────────────────
569
/// Reports whether session recording has been switched on through the
/// `STYGIAN_RECORD_SESSION` environment variable.
///
/// The value is compared case-insensitively against `true`, `1` and `yes`.
/// An unset or unreadable variable counts as disabled.
pub fn is_recording_enabled() -> bool {
    std::env::var("STYGIAN_RECORD_SESSION").is_ok_and(|raw| {
        ["true", "1", "yes"]
            .iter()
            .any(|accepted| raw.eq_ignore_ascii_case(accepted))
    })
}
580
581// ─── Tests ───────────────────────────────────────────────────────────────────
582
#[cfg(test)]
mod tests {
    //! Unit tests for the recorder: lifecycle, buffer limits, network
    //! tracking, file exports and the small parsing/formatting helpers.

    use super::*;

    /// Builds a running recorder with small buffer limits for tests.
    fn test_recorder() -> SessionRecorder {
        SessionRecorder::start(RecorderConfig {
            output_dir: std::env::temp_dir(),
            max_events: 100,
            max_network_entries: 50,
        })
    }

    /// A freshly started recorder reports itself as running.
    #[test]
    fn recorder_starts_running() {
        let r = test_recorder();
        assert!(r.is_running());
    }

    /// `stop` flips the running flag.
    #[test]
    fn recorder_stops() {
        let r = test_recorder();
        r.stop();
        assert!(!r.is_running());
    }

    /// Events passed in while running are buffered.
    #[test]
    fn records_events_while_running() {
        let r = test_recorder();
        r.record_event("Page.loadEventFired", serde_json::json!({"timestamp": 1.0}));
        r.record_event("Page.frameNavigated", serde_json::json!({}));
        assert_eq!(r.event_count(), 2);
    }

    /// Events passed in after `stop` are ignored.
    #[test]
    fn does_not_record_after_stop() {
        let r = test_recorder();
        r.stop();
        r.record_event("Page.loadEventFired", serde_json::json!({}));
        assert_eq!(r.event_count(), 0);
    }

    /// The event buffer never grows past `max_events`.
    #[test]
    fn max_events_caps_buffer() {
        let r = SessionRecorder::start(RecorderConfig {
            output_dir: std::env::temp_dir(),
            max_events: 3,
            max_network_entries: 10,
        });
        for i in 0..10 {
            r.record_event("Test.event", serde_json::json!({"i": i}));
        }
        assert_eq!(r.event_count(), 3);
    }

    /// A request/response/finished sequence yields one completed entry.
    #[test]
    fn network_tracking_builds_entry() {
        let r = test_recorder();

        r.record_event(
            "Network.requestWillBeSent",
            serde_json::json!({
                "requestId": "req-1",
                "request": {
                    "method": "GET",
                    "url": "https://example.com/api?foo=bar",
                    "headers": {"User-Agent": "test/1.0"}
                }
            }),
        );

        r.record_event(
            "Network.responseReceived",
            serde_json::json!({
                "requestId": "req-1",
                "response": {
                    "status": 200,
                    "statusText": "OK",
                    "mimeType": "application/json",
                    "headers": {"Content-Type": "application/json"}
                }
            }),
        );

        r.record_event(
            "Network.loadingFinished",
            serde_json::json!({
                "requestId": "req-1",
                "encodedDataLength": 512
            }),
        );

        assert_eq!(r.network_entry_count(), 1);
    }

    /// The exported HAR file parses back into `Har` with the recorded entry.
    #[test]
    fn export_har_writes_valid_json() -> std::result::Result<(), Box<dyn std::error::Error>> {
        let r = test_recorder();

        // Send a complete network transaction
        r.record_event(
            "Network.requestWillBeSent",
            serde_json::json!({
                "requestId": "r1",
                "request": {"method": "GET", "url": "https://example.com/", "headers": {}}
            }),
        );
        r.record_event(
            "Network.responseReceived",
            serde_json::json!({
                "requestId": "r1",
                "response": {"status": 200, "statusText": "OK", "mimeType": "text/html", "headers": {}}
            }),
        );
        r.record_event(
            "Network.loadingFinished",
            serde_json::json!({"requestId": "r1", "encodedDataLength": 1024}),
        );

        // NOTE(review): fixed file name in the shared temp dir; concurrent
        // runs of this test suite could collide on it.
        let path = std::env::temp_dir().join("stygian_test.har");
        r.export_har(&path)?;

        let contents = std::fs::read_to_string(&path)?;
        let har: Har = serde_json::from_str(&contents)?;
        assert_eq!(har.log.entries.len(), 1);
        if let Some(entry) = har.log.entries.first() {
            assert_eq!(entry.request.method, "GET");
            assert_eq!(entry.response.status, 200);
        }
        // Best-effort cleanup; a failure to delete does not fail the test.
        let _ = std::fs::remove_file(&path);
        Ok(())
    }

    /// The event log export writes one line per buffered event.
    #[test]
    fn event_log_export_writes_ndjson() -> std::result::Result<(), Box<dyn std::error::Error>> {
        let r = test_recorder();
        r.record_event("A", serde_json::json!({"x": 1}));
        r.record_event("B", serde_json::json!({"y": 2}));

        let path = std::env::temp_dir().join("stygian_events.ndjson");
        r.export_event_log(&path)?;

        let contents = std::fs::read_to_string(&path)?;
        assert_eq!(contents.lines().count(), 2);
        // Best-effort cleanup; a failure to delete does not fail the test.
        let _ = std::fs::remove_file(&path);
        Ok(())
    }

    /// `parse_query` splits a URL's query string into named parameters.
    #[test]
    fn parse_query_string() {
        let params = parse_query("https://example.com/path?a=1&b=hello%20world");
        assert_eq!(params.len(), 2);
        let names: Vec<_> = params.iter().map(|p| p.name.as_str()).collect();
        assert!(names.contains(&"a"), "missing 'a'");
        assert!(names.contains(&"b"), "missing 'b'");
    }

    /// `iso_timestamp` produces a fixed-width, `Z`-terminated string.
    #[test]
    fn iso_timestamp_format() {
        let ts = iso_timestamp();
        // Should look like 2024-01-15T12:34:56.789Z
        assert!(ts.ends_with('Z'), "should end with Z: {ts}");
        assert_eq!(ts.len(), 24, "length should be 24: {ts}");
    }
}
743        assert!(ts.ends_with('Z'), "should end with Z: {ts}");
744        assert_eq!(ts.len(), 24, "length should be 24: {ts}");
745    }
746}