Skip to main content

stygian_browser/
acquisition.rs

//! Opinionated acquisition runner with deterministic escalation.
//!
//! The runner executes a mode-specific strategy ladder and returns a terminal
//! [`AcquisitionResult`] for every request, including setup-failure and timeout
//! paths.
6
7use std::sync::Arc;
8use std::time::{Duration, Instant};
9
10#[cfg(feature = "browserbase")]
11use chromiumoxide::Browser;
12#[cfg(feature = "browserbase")]
13use futures::StreamExt;
14use serde::{Deserialize, Serialize};
15use serde_json::Value;
16#[cfg(feature = "browserbase")]
17use tokio::time::timeout;
18
19use crate::BrowserPool;
20use crate::error::BrowserError;
21use crate::page::WaitUntil;
22
23/// Opinionated acquisition mode for the escalation ladder.
24#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
25#[serde(rename_all = "snake_case")]
26pub enum AcquisitionMode {
27    /// Prioritize lowest-latency paths.
28    Fast,
29    /// Favor reliability with broader escalation.
30    Resilient,
31    /// Start from stronger anti-bot paths.
32    Hostile,
33    /// Enter from a policy-guided start point.
34    Investigate,
35}
36
37/// Strategy stage attempted by the acquisition runner.
38#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
39#[serde(rename_all = "snake_case")]
40pub enum StrategyUsed {
41    /// Plain HTTP fetch.
42    DirectHttp,
43    /// HTTP fetch using a TLS-profiled client.
44    TlsProfiledHttp,
45    /// Browser session with opinionated light-stealth defaults.
46    BrowserLightStealth,
47    /// Browser session scoped to a sticky context id.
48    StickyProxyBrowserSession,
49    /// Managed remote browser session routed through Browserbase.
50    #[cfg(feature = "browserbase")]
51    BrowserbaseManagedSession,
52    /// Policy-guided entry marker for investigation mode.
53    InvestigateEntry,
54}
55
56/// One acquisition request.
57#[derive(Debug, Clone)]
58pub struct AcquisitionRequest {
59    /// Target URL.
60    pub url: String,
61    /// Acquisition mode.
62    pub mode: AcquisitionMode,
63    /// Optional selector that must be present for browser-stage success.
64    pub wait_for_selector: Option<String>,
65    /// Optional JavaScript extraction expression evaluated in browser stages.
66    pub extraction_js: Option<String>,
67    /// Hard wall-clock timeout for the whole acquisition attempt.
68    pub total_timeout: Duration,
69    /// Per-navigation timeout for browser stages.
70    pub navigation_timeout: Duration,
71    /// Per-request timeout for HTTP stages.
72    pub request_timeout: Duration,
73    /// Maximum HTML bytes captured into `html_excerpt`.
74    pub html_excerpt_bytes: usize,
75    /// Optional policy-guided stage that `Investigate` mode starts from.
76    pub investigate_start: Option<StrategyUsed>,
77    /// Opt into the optional Browserbase-managed stage when available.
78    pub browserbase_enabled: bool,
79}
80
81impl Default for AcquisitionRequest {
82    fn default() -> Self {
83        Self {
84            url: String::new(),
85            mode: AcquisitionMode::Resilient,
86            wait_for_selector: None,
87            extraction_js: None,
88            total_timeout: Duration::from_secs(45),
89            navigation_timeout: Duration::from_secs(30),
90            request_timeout: Duration::from_secs(15),
91            html_excerpt_bytes: 4_096,
92            investigate_start: None,
93            browserbase_enabled: false,
94        }
95    }
96}
97
98/// Failure class recorded per strategy stage.
99#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
100#[serde(rename_all = "snake_case")]
101pub enum StageFailureKind {
102    /// Stage initialization/setup failed.
103    Setup,
104    /// Stage hit a timeout.
105    Timeout,
106    /// Stage reached a known anti-bot block class.
107    Blocked,
108    /// Transport/runtime failure.
109    Transport,
110    /// Extraction/validation failure.
111    Extraction,
112}
113
114/// Captured failure record for one stage.
115#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
116pub struct StageFailure {
117    /// Stage where the failure happened.
118    pub strategy: StrategyUsed,
119    /// Coarse failure kind.
120    pub kind: StageFailureKind,
121    /// Compact diagnostic message.
122    pub message: String,
123}
124
125/// Terminal acquisition result.
126#[derive(Debug, Clone, Serialize, Deserialize)]
127pub struct AcquisitionResult {
128    /// `true` when any stage satisfied success criteria.
129    pub success: bool,
130    /// Stage that produced the terminal success, if any.
131    pub strategy_used: Option<StrategyUsed>,
132    /// Ordered stage attempts.
133    pub attempted: Vec<StrategyUsed>,
134    /// Final URL observed from the successful stage.
135    pub final_url: Option<String>,
136    /// HTTP status code observed from the successful stage.
137    pub status_code: Option<u16>,
138    /// Best-effort HTML excerpt from the successful stage.
139    pub html_excerpt: Option<String>,
140    /// Optional extraction payload.
141    pub extracted: Option<Value>,
142    /// Failure bundle collected across stages.
143    pub failures: Vec<StageFailure>,
144    /// `true` when the wall-clock timeout fired before completion.
145    pub timed_out: bool,
146}
147
148impl AcquisitionResult {
149    const fn empty() -> Self {
150        Self {
151            success: false,
152            strategy_used: None,
153            attempted: Vec::new(),
154            final_url: None,
155            status_code: None,
156            html_excerpt: None,
157            extracted: None,
158            failures: Vec::new(),
159            timed_out: false,
160        }
161    }
162}
163
164#[derive(Debug, Clone)]
165struct StageSuccess {
166    final_url: Option<String>,
167    status_code: Option<u16>,
168    html_excerpt: Option<String>,
169    extracted: Option<Value>,
170}
171
172#[derive(Debug, Clone)]
173enum StageOutcome {
174    Marker,
175    Success(StageSuccess),
176    Failure(StageFailure),
177}
178
179/// Runner facade for opinionated acquisition.
180#[derive(Clone)]
181pub struct AcquisitionRunner {
182    pool: Arc<BrowserPool>,
183}
184
185impl AcquisitionRunner {
186    /// Create a new acquisition runner.
187    ///
188    /// # Example
189    ///
190    /// ```no_run
191    /// use stygian_browser::{AcquisitionRunner, BrowserConfig, BrowserPool};
192    ///
193    /// # async fn run() -> stygian_browser::Result<()> {
194    /// let pool = BrowserPool::new(BrowserConfig::default()).await?;
195    /// let _runner = AcquisitionRunner::new(pool);
196    /// # Ok(())
197    /// # }
198    /// ```
199    #[must_use]
200    pub const fn new(pool: Arc<BrowserPool>) -> Self {
201        Self { pool }
202    }
203
204    /// Return the deterministic stage ladder for a mode.
205    ///
206    /// Investigation mode starts at `investigate_start` when provided.
207    #[must_use]
208    pub fn strategy_ladder(
209        mode: AcquisitionMode,
210        investigate_start: Option<StrategyUsed>,
211    ) -> Vec<StrategyUsed> {
212        let mut stages = match mode {
213            AcquisitionMode::Fast => vec![
214                StrategyUsed::DirectHttp,
215                StrategyUsed::TlsProfiledHttp,
216                StrategyUsed::BrowserLightStealth,
217            ],
218            AcquisitionMode::Resilient => vec![
219                StrategyUsed::DirectHttp,
220                StrategyUsed::TlsProfiledHttp,
221                StrategyUsed::BrowserLightStealth,
222                StrategyUsed::StickyProxyBrowserSession,
223            ],
224            AcquisitionMode::Hostile => vec![
225                StrategyUsed::BrowserLightStealth,
226                StrategyUsed::StickyProxyBrowserSession,
227                StrategyUsed::TlsProfiledHttp,
228                StrategyUsed::DirectHttp,
229            ],
230            AcquisitionMode::Investigate => {
231                let start = investigate_start.unwrap_or(StrategyUsed::BrowserLightStealth);
232                vec![
233                    StrategyUsed::InvestigateEntry,
234                    start,
235                    StrategyUsed::StickyProxyBrowserSession,
236                    StrategyUsed::TlsProfiledHttp,
237                ]
238            }
239        };
240
241        dedupe_preserve_order(&mut stages);
242        stages
243    }
244
245    /// Execute the acquisition ladder and return a terminal result.
246    ///
247    /// This method never panics and always returns an [`AcquisitionResult`],
248    /// including timeout and setup-failure paths.
249    ///
250    /// # Example
251    ///
252    /// ```no_run
253    /// use stygian_browser::{AcquisitionMode, AcquisitionRequest, AcquisitionRunner, BrowserConfig, BrowserPool};
254    ///
255    /// # async fn run() -> stygian_browser::Result<()> {
256    /// let pool = BrowserPool::new(BrowserConfig::default()).await?;
257    /// let runner = AcquisitionRunner::new(pool);
258    /// let request = AcquisitionRequest {
259    ///     url: "https://example.com".to_string(),
260    ///     mode: AcquisitionMode::Resilient,
261    ///     ..AcquisitionRequest::default()
262    /// };
263    /// let _result = runner.run(request).await;
264    /// # Ok(())
265    /// # }
266    /// ```
267    pub async fn run(&self, request: AcquisitionRequest) -> AcquisitionResult {
268        let timeout = request.total_timeout;
269        let timeout_strategy = Self::strategy_ladder(request.mode, request.investigate_start)
270            .into_iter()
271            .find(|strategy| *strategy != StrategyUsed::InvestigateEntry)
272            .unwrap_or(StrategyUsed::DirectHttp);
273        let mut result = tokio::time::timeout(timeout, self.run_inner(&request))
274            .await
275            .unwrap_or_else(|_| {
276                let mut timed_out = AcquisitionResult::empty();
277                timed_out.timed_out = true;
278                timed_out.failures.push(StageFailure {
279                    strategy: timeout_strategy,
280                    kind: StageFailureKind::Timeout,
281                    message: format!("acquisition timed out after {}ms", timeout.as_millis()),
282                });
283                timed_out
284            });
285
286        if !result.success {
287            // Guarantee deterministic terminal output for all unsuccessful runs.
288            if result.failures.is_empty() {
289                result.failures.push(StageFailure {
290                    strategy: timeout_strategy,
291                    kind: StageFailureKind::Transport,
292                    message: "acquisition ended without stage output".to_string(),
293                });
294            }
295        }
296
297        result
298    }
299
300    async fn run_inner(&self, request: &AcquisitionRequest) -> AcquisitionResult {
301        let mut result = AcquisitionResult::empty();
302
303        #[cfg(feature = "browserbase")]
304        let mut ladder = Self::strategy_ladder(request.mode, request.investigate_start);
305
306        #[cfg(not(feature = "browserbase"))]
307        let ladder = Self::strategy_ladder(request.mode, request.investigate_start);
308
309        #[cfg(feature = "browserbase")]
310        {
311            maybe_insert_browserbase_stage(&mut ladder, request.browserbase_enabled);
312        }
313        let started = Instant::now();
314
315        for strategy in ladder {
316            if started.elapsed() >= request.total_timeout {
317                result.timed_out = true;
318                result.failures.push(StageFailure {
319                    strategy,
320                    kind: StageFailureKind::Timeout,
321                    message: "wall-clock timeout reached before stage execution".to_string(),
322                });
323                break;
324            }
325
326            result.attempted.push(strategy);
327            match self.execute_stage(strategy, request).await {
328                StageOutcome::Marker => {}
329                StageOutcome::Success(success) => {
330                    result.success = true;
331                    result.strategy_used = Some(strategy);
332                    result.final_url = success.final_url;
333                    result.status_code = success.status_code;
334                    result.html_excerpt = success.html_excerpt;
335                    result.extracted = success.extracted;
336                    break;
337                }
338                StageOutcome::Failure(failure) => result.failures.push(failure),
339            }
340        }
341
342        result
343    }
344
345    async fn execute_stage(
346        &self,
347        strategy: StrategyUsed,
348        request: &AcquisitionRequest,
349    ) -> StageOutcome {
350        match strategy {
351            StrategyUsed::DirectHttp => {
352                #[cfg(feature = "tls-config")]
353                {
354                    self.run_http_stage(request, false).await
355                }
356
357                #[cfg(not(feature = "tls-config"))]
358                {
359                    self.run_http_stage(request, false)
360                }
361            }
362            StrategyUsed::TlsProfiledHttp => {
363                #[cfg(feature = "tls-config")]
364                {
365                    self.run_http_stage(request, true).await
366                }
367
368                #[cfg(not(feature = "tls-config"))]
369                {
370                    self.run_http_stage(request, true)
371                }
372            }
373            StrategyUsed::BrowserLightStealth => self.run_browser_stage(request, false).await,
374            StrategyUsed::StickyProxyBrowserSession => self.run_browser_stage(request, true).await,
375            #[cfg(feature = "browserbase")]
376            StrategyUsed::BrowserbaseManagedSession => Self::run_browserbase_stage(request).await,
377            StrategyUsed::InvestigateEntry => StageOutcome::Marker,
378        }
379    }
380
381    #[cfg(feature = "browserbase")]
382    #[allow(clippy::too_many_lines)]
383    async fn run_browserbase_stage(request: &AcquisitionRequest) -> StageOutcome {
384        if !request.browserbase_enabled {
385            return StageOutcome::Failure(StageFailure {
386                strategy: StrategyUsed::BrowserbaseManagedSession,
387                kind: StageFailureKind::Setup,
388                message: "browserbase stage disabled for this request".to_string(),
389            });
390        }
391
392        let api_key = match std::env::var("BROWSERBASE_API_KEY") {
393            Ok(value) if !value.trim().is_empty() => value,
394            _ => {
395                return StageOutcome::Failure(StageFailure {
396                    strategy: StrategyUsed::BrowserbaseManagedSession,
397                    kind: StageFailureKind::Setup,
398                    message: "browserbase requires BROWSERBASE_API_KEY".to_string(),
399                });
400            }
401        };
402
403        let project_id = match std::env::var("BROWSERBASE_PROJECT_ID") {
404            Ok(value) if !value.trim().is_empty() => value,
405            _ => {
406                return StageOutcome::Failure(StageFailure {
407                    strategy: StrategyUsed::BrowserbaseManagedSession,
408                    kind: StageFailureKind::Setup,
409                    message: "browserbase requires BROWSERBASE_PROJECT_ID".to_string(),
410                });
411            }
412        };
413
414        let session = match create_browserbase_session(request, &api_key, &project_id).await {
415            Ok(session) => session,
416            Err(err) => {
417                return StageOutcome::Failure(StageFailure {
418                    strategy: StrategyUsed::BrowserbaseManagedSession,
419                    kind: classify_browser_error(&err),
420                    message: err.to_string(),
421                });
422            }
423        };
424
425        let connect_timeout = request.request_timeout.min(request.total_timeout);
426        let (mut browser, mut handler) = match timeout(
427            connect_timeout,
428            Browser::connect(session.connect_url.clone()),
429        )
430        .await
431        {
432            Ok(Ok(pair)) => pair,
433            Ok(Err(err)) => {
434                let _ = delete_browserbase_session(request, &api_key, &session.id).await;
435                return StageOutcome::Failure(StageFailure {
436                    strategy: StrategyUsed::BrowserbaseManagedSession,
437                    kind: StageFailureKind::Transport,
438                    message: format!("browserbase connect failed: {err}"),
439                });
440            }
441            Err(_) => {
442                let _ = delete_browserbase_session(request, &api_key, &session.id).await;
443                return StageOutcome::Failure(StageFailure {
444                    strategy: StrategyUsed::BrowserbaseManagedSession,
445                    kind: StageFailureKind::Timeout,
446                    message: format!(
447                        "browserbase connect timed out after {}ms",
448                        connect_timeout.as_millis()
449                    ),
450                });
451            }
452        };
453
454        let handler_task = tokio::spawn(async move {
455            while let Some(event) = handler.next().await {
456                if let Err(error) = event {
457                    tracing::warn!(%error, "browserbase handler error");
458                    break;
459                }
460            }
461        });
462
463        let run_result =
464            async {
465                let raw_page = browser.new_page("about:blank").await.map_err(|err| {
466                    BrowserError::CdpError {
467                        operation: "Browser.newPage".to_string(),
468                        message: err.to_string(),
469                    }
470                })?;
471
472                let mut page = crate::page::PageHandle::new(raw_page, request.navigation_timeout);
473
474                page.navigate(
475                    &request.url,
476                    WaitUntil::DomContentLoaded,
477                    request.navigation_timeout,
478                )
479                .await?;
480
481                if let Some(selector) = &request.wait_for_selector {
482                    page.wait_for_selector(selector, request.navigation_timeout)
483                        .await?;
484                }
485
486                let extracted = match request.extraction_js.as_deref() {
487                    Some(script) => Some(page.eval::<Value>(script).await.map_err(|err| {
488                        BrowserError::ScriptExecutionFailed {
489                            script: script.to_string(),
490                            reason: err.to_string(),
491                        }
492                    })?),
493                    None => None,
494                };
495
496                let html = page.content().await?;
497                let final_url = page.url().await.ok();
498                let status_code = page.status_code().ok().flatten();
499
500                Ok::<StageSuccess, BrowserError>(StageSuccess {
501                    final_url,
502                    status_code,
503                    html_excerpt: Some(truncate_html(&html, request.html_excerpt_bytes)),
504                    extracted,
505                })
506            }
507            .await;
508
509        let _ = timeout(Duration::from_secs(5), browser.close()).await;
510        handler_task.abort();
511        let _ = delete_browserbase_session(request, &api_key, &session.id).await;
512
513        match run_result {
514            Ok(success) => {
515                if is_block_status(success.status_code) {
516                    StageOutcome::Failure(StageFailure {
517                        strategy: StrategyUsed::BrowserbaseManagedSession,
518                        kind: StageFailureKind::Blocked,
519                        message: format!(
520                            "blocked status during browserbase stage: {:?}",
521                            success.status_code
522                        ),
523                    })
524                } else {
525                    StageOutcome::Success(success)
526                }
527            }
528            Err(err) => StageOutcome::Failure(StageFailure {
529                strategy: StrategyUsed::BrowserbaseManagedSession,
530                kind: classify_browser_error(&err),
531                message: err.to_string(),
532            }),
533        }
534    }
535
536    async fn run_browser_stage(&self, request: &AcquisitionRequest, sticky: bool) -> StageOutcome {
537        let strategy = if sticky {
538            StrategyUsed::StickyProxyBrowserSession
539        } else {
540            StrategyUsed::BrowserLightStealth
541        };
542
543        let handle_result = if sticky {
544            let context = host_hint(&request.url).unwrap_or_else(|| "default".to_string());
545            self.pool.acquire_for(&context).await
546        } else {
547            self.pool.acquire().await
548        };
549
550        let handle = match handle_result {
551            Ok(handle) => handle,
552            Err(err) => {
553                return StageOutcome::Failure(StageFailure {
554                    strategy,
555                    kind: StageFailureKind::Setup,
556                    message: format!("browser acquire failed: {err}"),
557                });
558            }
559        };
560
561        let page_result = async {
562            let browser = handle.browser().ok_or_else(|| {
563                BrowserError::ConfigError("browser handle already released".to_string())
564            })?;
565            let mut page = browser.new_page().await?;
566            page.navigate(
567                &request.url,
568                WaitUntil::DomContentLoaded,
569                request.navigation_timeout,
570            )
571            .await?;
572
573            if let Some(selector) = &request.wait_for_selector {
574                page.wait_for_selector(selector, request.navigation_timeout)
575                    .await?;
576            }
577
578            let extracted = match request.extraction_js.as_deref() {
579                Some(script) => Some(page.eval::<Value>(script).await.map_err(|err| {
580                    BrowserError::ScriptExecutionFailed {
581                        script: script.to_string(),
582                        reason: err.to_string(),
583                    }
584                })?),
585                None => None,
586            };
587
588            let html = page.content().await?;
589            let final_url = page.url().await.ok();
590            let status_code = page.status_code().ok().flatten();
591            let html_excerpt = truncate_html(&html, request.html_excerpt_bytes);
592
593            drop(page);
594
595            Ok::<StageSuccess, BrowserError>(StageSuccess {
596                final_url,
597                status_code,
598                html_excerpt: Some(html_excerpt),
599                extracted,
600            })
601        }
602        .await;
603
604        handle.release().await;
605
606        match page_result {
607            Ok(success) => {
608                if is_block_status(success.status_code) {
609                    StageOutcome::Failure(StageFailure {
610                        strategy,
611                        kind: StageFailureKind::Blocked,
612                        message: format!(
613                            "blocked status during browser stage: {:?}",
614                            success.status_code
615                        ),
616                    })
617                } else {
618                    StageOutcome::Success(success)
619                }
620            }
621            Err(err) => StageOutcome::Failure(StageFailure {
622                strategy,
623                kind: classify_browser_error(&err),
624                message: err.to_string(),
625            }),
626        }
627    }
628
629    #[cfg(feature = "tls-config")]
630    async fn run_http_stage(
631        &self,
632        request: &AcquisitionRequest,
633        tls_profiled: bool,
634    ) -> StageOutcome {
635        if request.wait_for_selector.is_some() || request.extraction_js.is_some() {
636            return StageOutcome::Failure(StageFailure {
637                strategy: if tls_profiled {
638                    StrategyUsed::TlsProfiledHttp
639                } else {
640                    StrategyUsed::DirectHttp
641                },
642                kind: StageFailureKind::Extraction,
643                message: "HTTP stages cannot satisfy selector/extraction requirements".to_string(),
644            });
645        }
646
647        self.run_http_stage_impl(request, tls_profiled).await
648    }
649
650    #[cfg(not(feature = "tls-config"))]
651    fn run_http_stage(&self, request: &AcquisitionRequest, tls_profiled: bool) -> StageOutcome {
652        if request.wait_for_selector.is_some() || request.extraction_js.is_some() {
653            return StageOutcome::Failure(StageFailure {
654                strategy: if tls_profiled {
655                    StrategyUsed::TlsProfiledHttp
656                } else {
657                    StrategyUsed::DirectHttp
658                },
659                kind: StageFailureKind::Extraction,
660                message: "HTTP stages cannot satisfy selector/extraction requirements".to_string(),
661            });
662        }
663
664        self.run_http_stage_impl(request, tls_profiled)
665    }
666
667    #[cfg(feature = "tls-config")]
668    async fn run_http_stage_impl(
669        &self,
670        request: &AcquisitionRequest,
671        tls_profiled: bool,
672    ) -> StageOutcome {
673        use crate::tls::{CHROME_131, build_profiled_client_preset};
674
675        let strategy = if tls_profiled {
676            StrategyUsed::TlsProfiledHttp
677        } else {
678            StrategyUsed::DirectHttp
679        };
680
681        let client = if tls_profiled {
682            match build_profiled_client_preset(&CHROME_131, None) {
683                Ok(client) => client,
684                Err(err) => {
685                    return StageOutcome::Failure(StageFailure {
686                        strategy,
687                        kind: StageFailureKind::Setup,
688                        message: format!("tls-profiled client setup failed: {err}"),
689                    });
690                }
691            }
692        } else {
693            match reqwest::Client::builder()
694                .timeout(request.request_timeout)
695                .cookie_store(true)
696                .build()
697            {
698                Ok(client) => client,
699                Err(err) => {
700                    return StageOutcome::Failure(StageFailure {
701                        strategy,
702                        kind: StageFailureKind::Setup,
703                        message: format!("http client setup failed: {err}"),
704                    });
705                }
706            }
707        };
708
709        let response = match client
710            .get(&request.url)
711            .timeout(request.request_timeout)
712            .send()
713            .await
714        {
715            Ok(response) => response,
716            Err(err) => {
717                return StageOutcome::Failure(StageFailure {
718                    strategy,
719                    kind: if err.is_timeout() {
720                        StageFailureKind::Timeout
721                    } else {
722                        StageFailureKind::Transport
723                    },
724                    message: err.to_string(),
725                });
726            }
727        };
728
729        let status_code = Some(response.status().as_u16());
730        let final_url = Some(response.url().to_string());
731        let html = match response.text().await {
732            Ok(text) => text,
733            Err(err) => {
734                return StageOutcome::Failure(StageFailure {
735                    strategy,
736                    kind: StageFailureKind::Transport,
737                    message: format!("response body read failed: {err}"),
738                });
739            }
740        };
741
742        if is_block_status(status_code) {
743            return StageOutcome::Failure(StageFailure {
744                strategy,
745                kind: StageFailureKind::Blocked,
746                message: format!("blocked status from HTTP stage: {status_code:?}"),
747            });
748        }
749
750        StageOutcome::Success(StageSuccess {
751            final_url,
752            status_code,
753            html_excerpt: Some(truncate_html(&html, request.html_excerpt_bytes)),
754            extracted: None,
755        })
756    }
757
758    #[cfg(not(feature = "tls-config"))]
759    fn run_http_stage_impl(
760        &self,
761        _request: &AcquisitionRequest,
762        tls_profiled: bool,
763    ) -> StageOutcome {
764        let strategy = if tls_profiled {
765            StrategyUsed::TlsProfiledHttp
766        } else {
767            StrategyUsed::DirectHttp
768        };
769        StageOutcome::Failure(StageFailure {
770            strategy,
771            kind: StageFailureKind::Setup,
772            message: "HTTP acquisition requires the `tls-config` feature".to_string(),
773        })
774    }
775}
776
/// Identifiers for a Browserbase session created by this module.
#[cfg(feature = "browserbase")]
#[derive(Debug, Clone)]
struct BrowserbaseSession {
    /// Session id, used to delete the session afterwards.
    id: String,
    /// URL the browser client connects to in order to drive the session.
    connect_url: String,
}
783
/// Create a Browserbase session for `project_id` and return its id and
/// connect URL.
///
/// Sends `POST {api_base}/sessions` with the API key supplied both as a bearer
/// token and as the `x-bb-api-key` header. The call is bounded by
/// `request.request_timeout`.
///
/// # Errors
///
/// - [`BrowserError::ConfigError`] when the HTTP client cannot be built or the
///   response lacks a connect URL or session id.
/// - [`BrowserError::ConnectionError`] when the request fails, the API answers
///   with a non-success status, or the response body is not valid JSON. The
///   error always names the `/sessions` endpoint that was called.
#[cfg(feature = "browserbase")]
async fn create_browserbase_session(
    request: &AcquisitionRequest,
    api_key: &str,
    project_id: &str,
) -> Result<BrowserbaseSession, BrowserError> {
    let client = reqwest::Client::builder()
        .timeout(request.request_timeout)
        .build()
        .map_err(|err| {
            BrowserError::ConfigError(format!("browserbase client setup failed: {err}"))
        })?;

    let create_url = format!("{}/sessions", browserbase_api_base());
    let response = client
        .post(create_url.as_str())
        .bearer_auth(api_key)
        .header("x-bb-api-key", api_key)
        .json(&serde_json::json!({ "projectId": project_id }))
        .send()
        .await
        .map_err(|err| BrowserError::ConnectionError {
            url: create_url.clone(),
            reason: err.to_string(),
        })?;

    let status = response.status();
    if !status.is_success() {
        // Best effort: include whatever body the API sent to aid diagnosis.
        let body = response.text().await.unwrap_or_default();
        return Err(BrowserError::ConnectionError {
            url: create_url,
            reason: format!("session create failed ({status}): {body}"),
        });
    }

    // Report the endpoint that produced the unparsable body, matching the
    // other errors in this function, rather than the bare API base.
    let payload: Value = response
        .json()
        .await
        .map_err(|err| BrowserError::ConnectionError {
            url: create_url.clone(),
            reason: format!("session create response parse failed: {err}"),
        })?;

    let connect_url = browserbase_connect_url(&payload).ok_or_else(|| {
        BrowserError::ConfigError("browserbase response missing connect URL".to_string())
    })?;
    let session_id = browserbase_session_id(&payload).ok_or_else(|| {
        BrowserError::ConfigError("browserbase response missing session id".to_string())
    })?;

    Ok(BrowserbaseSession {
        id: session_id,
        connect_url,
    })
}
839
/// Ask the Browserbase API to delete the session `session_id`.
///
/// Sends `DELETE {api_base}/sessions/{session_id}` with the API key supplied
/// both as a bearer token and as the `x-bb-api-key` header. The call is bounded
/// by `request.request_timeout`.
///
/// # Errors
///
/// - [`BrowserError::ConfigError`] when the HTTP client cannot be built.
/// - [`BrowserError::ConnectionError`] when the request fails or the API
///   answers with a non-success status.
#[cfg(feature = "browserbase")]
async fn delete_browserbase_session(
    request: &AcquisitionRequest,
    api_key: &str,
    session_id: &str,
) -> Result<(), BrowserError> {
    let client = match reqwest::Client::builder()
        .timeout(request.request_timeout)
        .build()
    {
        Ok(client) => client,
        Err(err) => {
            return Err(BrowserError::ConfigError(format!(
                "browserbase client setup failed: {err}"
            )));
        }
    };

    let delete_url = format!("{}/sessions/{session_id}", browserbase_api_base());
    let sent = client
        .delete(delete_url.as_str())
        .bearer_auth(api_key)
        .header("x-bb-api-key", api_key)
        .send()
        .await;

    let response = match sent {
        Ok(response) => response,
        Err(err) => {
            return Err(BrowserError::ConnectionError {
                url: delete_url,
                reason: err.to_string(),
            });
        }
    };

    let status = response.status();
    if !status.is_success() {
        return Err(BrowserError::ConnectionError {
            url: delete_url,
            reason: format!("session delete failed with status {status}"),
        });
    }

    Ok(())
}
874
875#[cfg(feature = "browserbase")]
/// Returns the Browserbase REST API base URL without a trailing slash.
///
/// The `BROWSERBASE_API_BASE` environment variable overrides the default
/// `https://api.browserbase.com/v1`. Surrounding whitespace and trailing
/// slashes are stripped from the override. When the variable is unset, is not
/// valid Unicode, or is empty after stripping, the default is returned so that
/// callers never end up formatting a scheme-less URL such as `/sessions`.
fn browserbase_api_base() -> String {
    const DEFAULT_API_BASE: &str = "https://api.browserbase.com/v1";

    std::env::var("BROWSERBASE_API_BASE")
        .ok()
        .map(|raw| raw.trim().trim_end_matches('/').to_string())
        // An empty override would make every endpoint a relative path.
        .filter(|base| !base.is_empty())
        .unwrap_or_else(|| DEFAULT_API_BASE.to_string())
}
882
883#[cfg(feature = "browserbase")]
884fn browserbase_session_id(payload: &Value) -> Option<String> {
885    payload
886        .get("id")
887        .or_else(|| payload.get("sessionId"))
888        .or_else(|| payload.get("session_id"))
889        .or_else(|| payload.get("data").and_then(|v| v.get("id")))
890        .or_else(|| payload.get("data").and_then(|v| v.get("sessionId")))
891        .or_else(|| payload.get("data").and_then(|v| v.get("session_id")))
892        .and_then(Value::as_str)
893        .map(ToString::to_string)
894}
895
896#[cfg(feature = "browserbase")]
897fn browserbase_connect_url(payload: &Value) -> Option<String> {
898    [
899        "connectUrl",
900        "connect_url",
901        "wsUrl",
902        "ws_url",
903        "websocketUrl",
904        "websocket_url",
905        "browserWSEndpoint",
906        "wsEndpoint",
907        "ws_endpoint",
908    ]
909    .iter()
910    .find_map(|key| payload.get(*key).and_then(Value::as_str))
911    .or_else(|| {
912        payload.get("data").and_then(|data| {
913            [
914                "connectUrl",
915                "connect_url",
916                "wsUrl",
917                "ws_url",
918                "websocketUrl",
919                "websocket_url",
920                "browserWSEndpoint",
921                "wsEndpoint",
922                "ws_endpoint",
923            ]
924            .iter()
925            .find_map(|key| data.get(*key).and_then(Value::as_str))
926        })
927    })
928    .map(ToString::to_string)
929}
930
931fn dedupe_preserve_order(stages: &mut Vec<StrategyUsed>) {
932    let mut seen = Vec::new();
933    stages.retain(|stage| {
934        if seen.contains(stage) {
935            false
936        } else {
937            seen.push(*stage);
938            true
939        }
940    });
941}
942
943#[cfg(feature = "browserbase")]
944fn maybe_insert_browserbase_stage(stages: &mut Vec<StrategyUsed>, enabled: bool) {
945    if !enabled || stages.contains(&StrategyUsed::BrowserbaseManagedSession) {
946        return;
947    }
948
949    if let Some(pos) = stages
950        .iter()
951        .position(|stage| *stage == StrategyUsed::StickyProxyBrowserSession)
952    {
953        stages.insert(pos, StrategyUsed::BrowserbaseManagedSession);
954    } else {
955        stages.push(StrategyUsed::BrowserbaseManagedSession);
956    }
957}
958
959fn classify_browser_error(error: &BrowserError) -> StageFailureKind {
960    match error {
961        BrowserError::Timeout { .. } => StageFailureKind::Timeout,
962        BrowserError::NavigationFailed { reason, .. } if reason.contains("selector") => {
963            StageFailureKind::Blocked
964        }
965        BrowserError::ScriptExecutionFailed { .. } => StageFailureKind::Extraction,
966        BrowserError::ConfigError(_) | BrowserError::PoolExhausted { .. } => {
967            StageFailureKind::Setup
968        }
969        BrowserError::ProxyUnavailable { .. }
970        | BrowserError::ConnectionError { .. }
971        | BrowserError::CdpError { .. }
972        | BrowserError::LaunchFailed { .. }
973        | BrowserError::NavigationFailed { .. }
974        | BrowserError::Io(_)
975        | BrowserError::StaleNode { .. } => StageFailureKind::Transport,
976        #[cfg(feature = "extract")]
977        BrowserError::ExtractionFailed(_) => StageFailureKind::Extraction,
978    }
979}
980
/// Reports whether an HTTP status is one of the codes this module treats as a
/// block signal: 401, 403, 407, 429 or 503. A missing status is never a block.
const fn is_block_status(status: Option<u16>) -> bool {
    match status {
        Some(401 | 403 | 407 | 429 | 503) => true,
        Some(_) | None => false,
    }
}
984
/// Returns `html` cut down to at most `max_bytes` bytes of UTF-8.
///
/// Input that already fits is returned unchanged. Otherwise the result is the
/// longest prefix of `html` that ends on a character boundary and is no longer
/// than `max_bytes`, so a multi-byte character is never split.
fn truncate_html(html: &str, max_bytes: usize) -> String {
    if html.len() <= max_bytes {
        return html.to_owned();
    }

    // `max_bytes < html.len()` here; step back to the nearest character
    // boundary. Offset 0 is always a boundary, so the loop terminates.
    let mut cut = max_bytes;
    while !html.is_char_boundary(cut) {
        cut -= 1;
    }
    html[..cut].to_owned()
}
999
/// Extracts a lowercase host name from an absolute URL, without parsing the
/// URL in full.
///
/// The authority is the text after the first `://` up to the first `/`, `?`
/// or `#`. Any userinfo (everything up to the last `@` in the authority) and
/// any port are dropped. A bracketed IPv6 literal keeps its brackets, for
/// example `[::1]`.
///
/// Returns `None` when `url` has no `://`, when the host is empty, or when a
/// bracketed literal is missing its closing `]`.
fn host_hint(url: &str) -> Option<String> {
    let (_, after_scheme) = url.split_once("://")?;

    // Stop at `?` and `#` as well as `/`: otherwise an `@` or `:` inside a
    // query or fragment would be mistaken for userinfo or a port separator.
    let authority = after_scheme
        .split(|c: char| matches!(c, '/' | '?' | '#'))
        .next()?;
    let host_port = authority.rsplit('@').next()?;

    let host = if host_port.starts_with('[') {
        // IPv6 literal: the colons inside the brackets are part of the host.
        let closing = host_port.find(']')?;
        &host_port[..=closing]
    } else {
        host_port.split(':').next()?
    };

    if host.is_empty() {
        None
    } else {
        Some(host.to_ascii_lowercase())
    }
}
1010
#[cfg(test)]
mod tests {
    use super::*;

    /// Pins the exact stage order produced for `Fast` with no entry point and
    /// for `Investigate` entered at the sticky-session stage.
    #[test]
    fn ladder_is_deterministic_for_modes() {
        let fast = AcquisitionRunner::strategy_ladder(AcquisitionMode::Fast, None);
        let expected_fast = vec![
            StrategyUsed::DirectHttp,
            StrategyUsed::TlsProfiledHttp,
            StrategyUsed::BrowserLightStealth,
        ];
        assert_eq!(fast, expected_fast);

        let investigate = AcquisitionRunner::strategy_ladder(
            AcquisitionMode::Investigate,
            Some(StrategyUsed::StickyProxyBrowserSession),
        );
        let expected_investigate = vec![
            StrategyUsed::InvestigateEntry,
            StrategyUsed::StickyProxyBrowserSession,
            StrategyUsed::TlsProfiledHttp,
        ];
        assert_eq!(investigate, expected_investigate);
    }

    /// 403 and 429 count as block statuses; 200 and a missing status do not.
    #[test]
    fn block_statuses_are_classified() {
        for blocked in [403, 429] {
            assert!(is_block_status(Some(blocked)));
        }
        assert!(!is_block_status(Some(200)));
        assert!(!is_block_status(None));
    }

    /// Userinfo, port and path are stripped from the extracted host.
    #[test]
    fn host_hint_extracts_authority() {
        let host = host_hint("https://user:pass@example.com:8443/path");
        assert_eq!(host, Some("example.com".to_string()));
    }

    /// A limit that falls inside a multi-byte character drops that character.
    #[test]
    fn truncate_html_respects_utf8_boundaries() {
        let truncated = truncate_html("abc😀def", 5);
        assert_eq!(truncated, "abc");
    }

    /// The connect URL is found when it only appears under `data`.
    #[cfg(feature = "browserbase")]
    #[test]
    fn browserbase_connect_url_is_extracted_from_nested_data() {
        let payload = serde_json::json!({
            "data": {
                "connectUrl": "wss://connect.browserbase.example/devtools/browser/abc"
            }
        });

        let extracted = browserbase_connect_url(&payload);

        assert_eq!(
            extracted,
            Some("wss://connect.browserbase.example/devtools/browser/abc".to_string())
        );
    }

    /// The managed-session stage lands directly before the sticky stage.
    #[cfg(feature = "browserbase")]
    #[test]
    fn browserbase_stage_is_inserted_before_sticky_stage() {
        let mut ladder = vec![
            StrategyUsed::DirectHttp,
            StrategyUsed::StickyProxyBrowserSession,
            StrategyUsed::TlsProfiledHttp,
        ];
        let expected = vec![
            StrategyUsed::DirectHttp,
            StrategyUsed::BrowserbaseManagedSession,
            StrategyUsed::StickyProxyBrowserSession,
            StrategyUsed::TlsProfiledHttp,
        ];

        maybe_insert_browserbase_stage(&mut ladder, true);

        assert_eq!(ladder, expected);
    }
}