stygian_graph/adapters/
browser.rs

1//! JavaScript rendering adapter using stygian-browser
2//!
3//! Implements the `ScrapingService` port using a headless browser (via the
4//! `stygian-browser` crate) for pages that require JavaScript execution.
5//!
6//! Features:
7//! - Full JS execution via Chrome DevTools Protocol
8//! - Configurable wait strategies (DOM ready, network idle, selector)
9//! - Stealth mode via stygian-browser's anti-detection features
//! - Reports an unusable browser pool as `ServiceError::Unavailable` so callers can fall back to HTTP
11//! - Circuit-breaker friendly: propagates pool-exhaustion as service errors
12//!
13//! # Example
14//!
15//! ```no_run
16//! use stygian_graph::adapters::browser::{BrowserAdapter, BrowserAdapterConfig};
17//! use stygian_graph::ports::{ScrapingService, ServiceInput};
18//! use serde_json::json;
19//!
20//! # tokio::runtime::Runtime::new().unwrap().block_on(async {
21//! let config = BrowserAdapterConfig::default();
22//! let adapter = BrowserAdapter::with_config(config);
23//! let input = ServiceInput {
24//!     url: "https://example.com".to_string(),
25//!     params: json!({ "wait_strategy": "dom_content_loaded", "timeout_ms": 30000 }),
26//! };
27//! // let result = adapter.execute(input).await.unwrap();
28//! # });
29//! ```
30
31use std::fmt;
32use std::time::{Duration, Instant};
33
34use async_trait::async_trait;
35use serde_json::{Value, json};
36
37use crate::domain::error::{Result, ServiceError, StygianError};
38use crate::ports::{ScrapingService, ServiceInput, ServiceOutput};
39
/// How the adapter decides that a JavaScript-rendered page is ready to be read.
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub enum WaitStrategy {
    /// Ready as soon as the DOM content has loaded. This is the default variant.
    #[default]
    DomContentLoaded,
    /// Ready once all network requests have completed.
    NetworkIdle,
    /// Ready once an element matching the contained CSS selector is in the DOM.
    SelectorAppears(String),
    /// Navigate, then pause for the contained duration before reading the page.
    Fixed(Duration),
}
53
54impl WaitStrategy {
55    /// Parse from a JSON parameter value
56    fn from_params(params: &Value) -> Self {
57        match params.get("wait_strategy").and_then(Value::as_str) {
58            Some("network_idle") => Self::NetworkIdle,
59            Some("dom_content_loaded") => Self::DomContentLoaded,
60            Some(s) if s.starts_with("selector:") => {
61                Self::SelectorAppears(s.trim_start_matches("selector:").to_string())
62            }
63            _ => params
64                .get("wait_ms")
65                .and_then(Value::as_u64)
66                .map_or(Self::DomContentLoaded, |ms| {
67                    Self::Fixed(Duration::from_millis(ms))
68                }),
69        }
70    }
71}
72
73impl fmt::Display for WaitStrategy {
74    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
75        match self {
76            Self::DomContentLoaded => write!(f, "dom_content_loaded"),
77            Self::NetworkIdle => write!(f, "network_idle"),
78            Self::SelectorAppears(selector) => write!(f, "selector_appears({selector})"),
79            Self::Fixed(duration) => write!(f, "fixed_{}ms", duration.as_millis()),
80        }
81    }
82}
83
/// Anti-detection level requested for browser automation.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum StealthLevel {
    /// No stealth measures (fastest, but detectable).
    None,
    /// Basic stealth: hide automation signals. This is the default variant.
    #[default]
    Basic,
    /// Advanced stealth: full fingerprint spoofing.
    Advanced,
}
95
96impl StealthLevel {
97    fn from_params(params: &Value) -> Self {
98        match params.get("stealth_level").and_then(Value::as_str) {
99            Some("advanced") => Self::Advanced,
100            Some("none") => Self::None,
101            _ => Self::Basic,
102        }
103    }
104
105    /// Convert stealth level to string representation
106    pub const fn as_str(&self) -> &'static str {
107        match self {
108            Self::None => "none",
109            Self::Basic => "basic",
110            Self::Advanced => "advanced",
111        }
112    }
113}
114
115/// Configuration for the `BrowserAdapter`
116#[derive(Debug, Clone)]
117pub struct BrowserAdapterConfig {
118    /// Default navigation timeout
119    pub timeout: Duration,
120    /// Maximum concurrent browser sessions (maps to pool size)
121    pub max_concurrent: usize,
122    /// Default wait strategy
123    pub default_wait: WaitStrategy,
124    /// Default stealth level
125    pub default_stealth: StealthLevel,
126    /// Whether to block common tracking/ad resources (improves speed)
127    pub block_resources: bool,
128    /// Whether to run in headless mode
129    pub headless: bool,
130    /// Custom User-Agent string (None = default)
131    pub user_agent: Option<String>,
132    /// Viewport width in pixels
133    pub viewport_width: u32,
134    /// Viewport height in pixels
135    pub viewport_height: u32,
136}
137
138impl Default for BrowserAdapterConfig {
139    fn default() -> Self {
140        Self {
141            timeout: Duration::from_secs(30),
142            max_concurrent: 5,
143            default_wait: WaitStrategy::DomContentLoaded,
144            default_stealth: StealthLevel::Basic,
145            block_resources: true,
146            headless: true,
147            user_agent: None,
148            viewport_width: 1920,
149            viewport_height: 1080,
150        }
151    }
152}
153
154/// Browser-based scraping adapter
155///
156/// Wraps stygian-browser's `BrowserPool` to implement the `ScrapingService` port.
157/// Falls back to an error indicating unavailability when the browser pool
158/// cannot be used (headless Chrome not available, pool exhausted, etc.).
159///
160/// The adapter accepts per-request parameters via `ServiceInput.params`:
161/// - `wait_strategy`: `"dom_content_loaded"` | `"network_idle"` | `"selector:<css>"` | `"fixed_ms:<n>"`
162/// - `stealth_level`: `"none"` | `"basic"` | `"advanced"`
163/// - `timeout_ms`: override default timeout in milliseconds
164/// - `wait_ms`: milliseconds to wait when strategy is "fixed"
165#[derive(Clone)]
166pub struct BrowserAdapter {
167    config: BrowserAdapterConfig,
168}
169
170impl BrowserAdapter {
171    /// Create a new `BrowserAdapter` with default configuration
172    ///
173    /// # Example
174    ///
175    /// ```
176    /// use stygian_graph::adapters::browser::BrowserAdapter;
177    /// use stygian_graph::ports::ScrapingService;
178    ///
179    /// let adapter = BrowserAdapter::new();
180    /// assert_eq!(adapter.name(), "browser");
181    /// ```
182    pub fn new() -> Self {
183        Self {
184            config: BrowserAdapterConfig::default(),
185        }
186    }
187
188    /// Create a new `BrowserAdapter` with custom configuration
189    ///
190    /// # Example
191    ///
192    /// ```
193    /// use stygian_graph::adapters::browser::{BrowserAdapter, BrowserAdapterConfig};
194    /// use std::time::Duration;
195    ///
196    /// let config = BrowserAdapterConfig {
197    ///     timeout: Duration::from_secs(60),
198    ///     block_resources: false,
199    ///     ..BrowserAdapterConfig::default()
200    /// };
201    /// let adapter = BrowserAdapter::with_config(config);
202    /// ```
203    pub const fn with_config(config: BrowserAdapterConfig) -> Self {
204        Self { config }
205    }
206
207    /// Extract per-request timeout from params, falling back to config default
208    fn resolve_timeout(&self, params: &Value) -> Duration {
209        params
210            .get("timeout_ms")
211            .and_then(Value::as_u64)
212            .map_or(self.config.timeout, Duration::from_millis)
213    }
214
215    /// Performs the browser navigation using stygian-browser's `BrowserPool`.
216    ///
217    /// Returns rendered HTML and timing metadata. When headless Chrome is
218    /// unavailable this returns a `ServiceError` so callers can react
219    /// (e.g. fall back to `HttpAdapter` via circuit-breaker logic).
220    #[allow(clippy::option_if_let_else)]
221    #[cfg(feature = "browser")]
222    async fn navigate_with_browser(
223        &self,
224        url: &str,
225        wait: &WaitStrategy,
226        timeout: Duration,
227    ) -> Result<(String, Value)> {
228        use stygian_browser::page::WaitUntil;
229        use stygian_browser::{BrowserConfig, BrowserPool};
230
231        let start = Instant::now();
232
233        // Step 1: Build browser config from adapter config
234        let browser_config = BrowserConfig {
235            headless: self.config.headless,
236            ..BrowserConfig::default()
237        };
238
239        // Step 2: Create pool (in production this would be cached at adapter level)
240        let pool = BrowserPool::new(browser_config)
241            .await
242            .map_err(|e| StygianError::Service(ServiceError::Unavailable(e.to_string())))?;
243
244        // Step 3: Acquire a browser handle with timeout
245        let handle = match tokio::time::timeout(timeout, pool.acquire()).await {
246            Ok(Ok(h)) => h,
247            Ok(Err(e)) => {
248                return Err(StygianError::Service(ServiceError::Unavailable(format!(
249                    "Browser pool exhausted or unavailable: {e}"
250                ))));
251            }
252            Err(_) => {
253                return Err(StygianError::Service(ServiceError::Unavailable(format!(
254                    "Browser acquisition timeout after {timeout:?}"
255                ))));
256            }
257        };
258
259        // Step 4: Get browser instance and create new page
260        let Some(instance) = handle.browser() else {
261            return Err(StygianError::Service(ServiceError::Unavailable(
262                "Failed to get browser instance after acquisition".to_string(),
263            )));
264        };
265
266        let mut page = instance
267            .new_page()
268            .await
269            .map_err(|e| StygianError::Service(ServiceError::Unavailable(e.to_string())))?;
270
271        // Step 5: Convert WaitStrategy to browser's WaitUntil
272        let wait_condition = match wait {
273            WaitStrategy::DomContentLoaded => WaitUntil::DomContentLoaded,
274            WaitStrategy::NetworkIdle => WaitUntil::NetworkIdle,
275            WaitStrategy::SelectorAppears(selector) => WaitUntil::Selector(selector.clone()),
276            WaitStrategy::Fixed(_duration) => WaitUntil::DomContentLoaded, // Fixed uses timeout, not condition
277        };
278
279        // Step 6: Navigate with specified wait strategy
280        if let Err(e) = page.navigate(url, wait_condition, timeout).await {
281            return Err(StygianError::Service(ServiceError::Unavailable(format!(
282                "Browser navigation failed: {e}"
283            ))));
284        }
285
286        // Step 7: Wait for fixed duration if specified
287        if let WaitStrategy::Fixed(duration) = wait {
288            tokio::time::sleep(*duration).await;
289        }
290
291        // Step 8: Get rendered HTML content
292        let html = page
293            .content()
294            .await
295            .map_err(|e| StygianError::Service(ServiceError::Unavailable(e.to_string())))?;
296
297        let elapsed = start.elapsed();
298
299        // Step 9: Return HTML and metadata
300        // BrowserHandle is automatically returned to pool when dropped
301        Ok((
302            html,
303            json!({
304                "url": url,
305                "navigation_time_ms": elapsed.as_millis(),
306                "wait_strategy": wait.to_string(),
307                "stealth_level": self.config.default_stealth.as_str(),
308                "viewport": {
309                    "width": self.config.viewport_width,
310                    "height": self.config.viewport_height
311                },
312                "rendered": true,
313            }),
314        ))
315    }
316
317    /// Fallback path when the `browser` feature is disabled
318    #[cfg(not(feature = "browser"))]
319    async fn navigate_with_browser(
320        &self,
321        url: &str,
322        _wait: &WaitStrategy,
323        _timeout: Duration,
324    ) -> Result<(String, Value)> {
325        Err(StygianError::Service(ServiceError::Unavailable(format!(
326            "stygian-graph was compiled without the 'browser' feature; \
327             cannot render JavaScript for URL: {url}"
328        ))))
329    }
330}
331
332impl Default for BrowserAdapter {
333    fn default() -> Self {
334        Self::new()
335    }
336}
337
338#[async_trait]
339impl ScrapingService for BrowserAdapter {
340    /// Execute a JavaScript-rendered scrape
341    ///
342    /// Accepts the following `params` keys:
343    /// - `wait_strategy` — how to determine page readiness
344    /// - `stealth_level` — anti-detection level  
345    /// - `timeout_ms` — per-request timeout override
346    ///
347    /// # Example
348    ///
349    /// ```no_run
350    /// use stygian_graph::adapters::browser::BrowserAdapter;
351    /// use stygian_graph::ports::{ScrapingService, ServiceInput};
352    /// use serde_json::json;
353    ///
354    /// # tokio::runtime::Runtime::new().unwrap().block_on(async {
355    /// let adapter = BrowserAdapter::new();
356    /// let input = ServiceInput {
357    ///     url: "https://example.com".to_string(),
358    ///     params: json!({ "wait_strategy": "network_idle", "stealth_level": "advanced" }),
359    /// };
360    /// // let output = adapter.execute(input).await.unwrap();
361    /// # });
362    /// ```
363    async fn execute(&self, input: ServiceInput) -> Result<ServiceOutput> {
364        let wait = WaitStrategy::from_params(&input.params);
365        let _stealth = StealthLevel::from_params(&input.params);
366        let timeout = self.resolve_timeout(&input.params);
367
368        let (html, metadata) = tokio::time::timeout(
369            timeout + Duration::from_secs(5), // outer hard deadline
370            self.navigate_with_browser(&input.url, &wait, timeout),
371        )
372        .await
373        .map_err(|_| {
374            StygianError::Service(ServiceError::Timeout(
375                u64::try_from(timeout.as_millis()).unwrap_or(u64::MAX),
376            ))
377        })??;
378
379        Ok(ServiceOutput {
380            data: html,
381            metadata,
382        })
383    }
384
385    fn name(&self) -> &'static str {
386        "browser"
387    }
388}
389
#[cfg(test)]
#[allow(
    clippy::unwrap_used,
    clippy::expect_used,
    clippy::panic,
    clippy::redundant_closure_for_method_calls
)]
mod tests {
    use super::*;

    /// Address used by the tests that perform a real navigation.
    const EXAMPLE_URL: &str = "https://example.com";

    /// Builds a request for `url` carrying the given per-request parameters.
    fn request(url: &str, params: Value) -> ServiceInput {
        ServiceInput {
            url: url.to_string(),
            params,
        }
    }

    // ---- Pure unit tests: no browser required ----

    #[test]
    fn test_adapter_default_name() {
        assert_eq!(BrowserAdapter::new().name(), "browser");
    }

    #[test]
    fn test_wait_strategy_from_params_dom() {
        let parsed = WaitStrategy::from_params(&json!({ "wait_strategy": "dom_content_loaded" }));
        assert_eq!(parsed, WaitStrategy::DomContentLoaded);
    }

    #[test]
    fn test_wait_strategy_from_params_network_idle() {
        let parsed = WaitStrategy::from_params(&json!({ "wait_strategy": "network_idle" }));
        assert_eq!(parsed, WaitStrategy::NetworkIdle);
    }

    #[test]
    fn test_wait_strategy_from_params_selector() {
        let parsed =
            WaitStrategy::from_params(&json!({ "wait_strategy": "selector:#main-content" }));
        let expected = WaitStrategy::SelectorAppears("#main-content".to_string());
        assert_eq!(parsed, expected);
    }

    #[test]
    fn test_wait_strategy_from_params_fixed_ms() {
        let parsed = WaitStrategy::from_params(&json!({ "wait_ms": 500u64 }));
        assert_eq!(parsed, WaitStrategy::Fixed(Duration::from_millis(500)));
    }

    #[test]
    fn test_stealth_level_from_params() {
        let advanced = StealthLevel::from_params(&json!({ "stealth_level": "advanced" }));
        assert_eq!(advanced, StealthLevel::Advanced);

        let none = StealthLevel::from_params(&json!({ "stealth_level": "none" }));
        assert_eq!(none, StealthLevel::None);

        // A missing key falls back to the basic level.
        let fallback = StealthLevel::from_params(&json!({}));
        assert_eq!(fallback, StealthLevel::Basic);
    }

    #[test]
    fn test_resolve_timeout_override() {
        let adapter = BrowserAdapter::new();
        let resolved = adapter.resolve_timeout(&json!({ "timeout_ms": 5000u64 }));
        assert_eq!(resolved, Duration::from_millis(5000));
    }

    #[test]
    fn test_resolve_timeout_default() {
        let adapter = BrowserAdapter::new();
        let resolved = adapter.resolve_timeout(&json!({}));
        assert_eq!(resolved, Duration::from_secs(30));
    }

    #[test]
    fn test_config_builder() {
        let adapter = BrowserAdapter::with_config(BrowserAdapterConfig {
            timeout: Duration::from_secs(60),
            max_concurrent: 3,
            block_resources: false,
            ..BrowserAdapterConfig::default()
        });
        assert_eq!(adapter.config.timeout, Duration::from_secs(60));
        assert_eq!(adapter.config.max_concurrent, 3);
    }

    // ---- Ignored by default: these drive a real browser ----

    #[allow(clippy::panic)]
    #[tokio::test]
    #[ignore = "requires real Chrome binary"]
    async fn test_execute_returns_service_output_or_unavailable() {
        let adapter = BrowserAdapter::new();
        let input = request(EXAMPLE_URL, json!({ "wait_strategy": "dom_content_loaded" }));

        // Success and `Unavailable` are both acceptable outcomes.
        match adapter.execute(input).await {
            Err(StygianError::Service(ServiceError::Unavailable(_))) => {
                // expected when headless Chrome is not available
            }
            Err(e) => panic!("unexpected error: {e}"),
            Ok(output) => {
                assert!(!output.data.is_empty(), "output data should not be empty");
                assert!(output.metadata.is_object());
            }
        }
    }

    // Integration tests from T00 Task Requirements

    #[tokio::test]
    #[ignore = "requires real Chrome binary and external network access"]
    async fn browser_adapter_navigates_url() {
        let adapter = BrowserAdapter::with_config(BrowserAdapterConfig::default());
        let input = request(
            EXAMPLE_URL,
            json!({
                "wait_strategy": "dom_content_loaded",
                "timeout_ms": 30000
            }),
        );

        // Should succeed or return graceful unavailable (browser not installed).
        match adapter.execute(input).await {
            Err(StygianError::Service(ServiceError::Unavailable(_))) => {
                // Expected if Chrome not installed
            }
            Err(e) => panic!("Unexpected error: {e}"),
            Ok(output) => {
                assert!(!output.data.is_empty());

                let meta = &output.metadata;
                let rendered = meta
                    .get("rendered")
                    .and_then(Value::as_bool)
                    .unwrap_or(false);
                assert!(rendered);
                assert!(meta.get("navigation_time_ms").is_some());
                assert_eq!(
                    meta.get("url").and_then(Value::as_str),
                    Some("https://example.com")
                );
            }
        }
    }

    #[tokio::test]
    #[ignore = "Requires Chrome installed and network access; may panic if browser unavailable"]
    async fn browser_adapter_respects_timeout() {
        let adapter = BrowserAdapter::with_config(BrowserAdapterConfig {
            timeout: Duration::from_secs(2),
            ..Default::default()
        });

        // The endpoint delays for 10 seconds, so a 2 second limit must not succeed.
        let input = request("https://httpbin.org/delay/10", json!({"timeout_ms": 2000}));

        match adapter.execute(input).await {
            Ok(_) => {
                // Should not succeed with 2s timeout on 10s delay
                panic!("Expected timeout or unavailable, got success");
            }
            Err(StygianError::Service(ServiceError::Timeout(_))) => {
                // Acceptable: explicit timeout
            }
            Err(StygianError::Service(ServiceError::Unavailable(msg))) => {
                // Expected if Chrome is not installed or a timeout occurred.
                let recognised = ["timeout", "unavailable", "Chrome", "exhausted"]
                    .iter()
                    .any(|needle| msg.contains(*needle));
                assert!(recognised);
            }
            Err(e) => {
                // Any other error is acceptable (network, browser init, etc)
                eprintln!("Got acceptable error: {e}");
            }
        }
    }

    #[tokio::test]
    #[ignore = "requires real Chrome binary"]
    async fn browser_adapter_invalid_url() {
        let adapter = BrowserAdapter::with_config(BrowserAdapterConfig::default());
        let input = request("not-a-valid-url", json!({}));

        // The browser error must be surfaced rather than swallowed.
        let outcome = adapter.execute(input).await;
        assert!(outcome.is_err());
    }

    #[tokio::test]
    #[ignore = "requires real Chrome binary and external network access"]
    async fn browser_adapter_wait_strategy_selector() {
        let adapter = BrowserAdapter::with_config(BrowserAdapterConfig::default());
        let input = request(
            EXAMPLE_URL,
            json!({
                "wait_strategy": "selector:body"
            }),
        );

        match adapter.execute(input).await {
            Err(StygianError::Service(ServiceError::Unavailable(_))) => {
                // Expected if Chrome not installed
            }
            Err(e) => panic!("Unexpected error: {e}"),
            Ok(output) => {
                let reported = output
                    .metadata
                    .get("wait_strategy")
                    .and_then(Value::as_str);
                assert_eq!(reported, Some("selector_appears(body)"));
            }
        }
    }

    #[tokio::test]
    #[ignore = "requires real Chrome binary and external network access"]
    async fn browser_adapter_metadata_complete() {
        let adapter = BrowserAdapter::with_config(BrowserAdapterConfig {
            default_stealth: StealthLevel::Advanced,
            user_agent: Some("Mozilla/5.0".to_string()),
            viewport_width: 1440,
            viewport_height: 900,
            ..Default::default()
        });
        let input = request(EXAMPLE_URL, json!({}));

        match adapter.execute(input).await {
            Err(StygianError::Service(ServiceError::Unavailable(_))) => {
                // Expected if Chrome not installed
            }
            Err(e) => panic!("Unexpected error: {e}"),
            Ok(output) => {
                let meta = &output.metadata;
                assert_eq!(
                    meta.get("url").and_then(Value::as_str),
                    Some("https://example.com")
                );
                assert_eq!(
                    meta.get("stealth_level").and_then(Value::as_str),
                    Some("advanced")
                );
                assert!(meta.get("viewport").is_some());
                assert!(meta.get("navigation_time_ms").is_some());

                let viewport = meta.get("viewport").expect("viewport exists");
                assert_eq!(viewport.get("width").and_then(Value::as_u64), Some(1440));
                assert_eq!(viewport.get("height").and_then(Value::as_u64), Some(900));
            }
        }
    }
}