1use std::sync::Arc;
8use std::time::{Duration, Instant};
9
10#[cfg(feature = "browserbase")]
11use chromiumoxide::Browser;
12#[cfg(feature = "browserbase")]
13use futures::StreamExt;
14use serde::{Deserialize, Serialize};
15use serde_json::Value;
16#[cfg(feature = "browserbase")]
17use tokio::time::timeout;
18
19use crate::BrowserPool;
20use crate::error::BrowserError;
21use crate::page::WaitUntil;
22
/// Selects which strategy ladder an acquisition walks.
///
/// Serialized with `snake_case` variant names.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum AcquisitionMode {
    /// Direct HTTP, then TLS-profiled HTTP, then a light-stealth browser.
    Fast,
    /// The `Fast` ladder followed by a sticky-proxy browser session.
    Resilient,
    /// Browser stages first, with the HTTP stages as the last resort.
    Hostile,
    /// The `InvestigateEntry` marker, then the requested start strategy,
    /// then the sticky-proxy browser session and TLS-profiled HTTP.
    Investigate,
}
36
/// Identifies one stage of the acquisition ladder.
///
/// Serialized with `snake_case` variant names.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum StrategyUsed {
    /// HTTP GET without TLS profiling.
    DirectHttp,
    /// HTTP GET through the TLS-profiled client.
    TlsProfiledHttp,
    /// Page load in a browser taken from the pool with `acquire`.
    BrowserLightStealth,
    /// Page load in a browser taken from the pool with `acquire_for`, keyed by the URL host.
    StickyProxyBrowserSession,
    /// Page load in a remote session created through the Browserbase API.
    #[cfg(feature = "browserbase")]
    BrowserbaseManagedSession,
    /// Marker placed at the head of the `Investigate` ladder; executing it does no work.
    InvestigateEntry,
}
55
/// Describes one acquisition: what to fetch, how hard to try, and the time limits.
#[derive(Debug, Clone)]
pub struct AcquisitionRequest {
    /// URL that every stage fetches or navigates to.
    pub url: String,
    /// Mode that selects the strategy ladder.
    pub mode: AcquisitionMode,
    /// Selector the browser stages wait for after navigation.
    /// HTTP stages fail with `Extraction` when this is set.
    pub wait_for_selector: Option<String>,
    /// Script the browser stages evaluate in the page; its value ends up in `extracted`.
    /// HTTP stages fail with `Extraction` when this is set.
    pub extraction_js: Option<String>,
    /// Wall-clock budget for the whole run.
    pub total_timeout: Duration,
    /// Timeout passed to page navigation and selector waits.
    pub navigation_timeout: Duration,
    /// Timeout for HTTP requests, including the Browserbase API calls.
    pub request_timeout: Duration,
    /// Maximum number of bytes of HTML kept in `html_excerpt`.
    pub html_excerpt_bytes: usize,
    /// First real stage of the `Investigate` ladder; `BrowserLightStealth` when `None`.
    pub investigate_start: Option<StrategyUsed>,
    /// Opts this request in to the Browserbase stage (only used with the `browserbase` feature).
    pub browserbase_enabled: bool,
}
80
impl Default for AcquisitionRequest {
    /// Returns a request with an empty URL, `Resilient` mode, no selector or
    /// extraction script, and the Browserbase stage disabled.
    fn default() -> Self {
        Self {
            url: String::new(),
            mode: AcquisitionMode::Resilient,
            wait_for_selector: None,
            extraction_js: None,
            // Overall budget: 45 s, with 30 s per navigation and 15 s per HTTP request.
            total_timeout: Duration::from_secs(45),
            navigation_timeout: Duration::from_secs(30),
            request_timeout: Duration::from_secs(15),
            // Keep at most 4 KiB of HTML in the result.
            html_excerpt_bytes: 4_096,
            investigate_start: None,
            browserbase_enabled: false,
        }
    }
}
97
/// Coarse classification of why a stage failed.
///
/// Serialized with `snake_case` variant names.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum StageFailureKind {
    /// The stage could not be prepared (client build, browser acquire, missing configuration).
    Setup,
    /// A time limit expired.
    Timeout,
    /// The target answered with a blocking status, or a selector-related navigation failure occurred.
    Blocked,
    /// The connection, browser protocol, or response body read failed.
    Transport,
    /// Script evaluation failed, or the stage cannot satisfy selector/extraction requirements.
    Extraction,
}
113
/// Records one failed stage of an acquisition.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct StageFailure {
    /// Stage the failure is attributed to.
    pub strategy: StrategyUsed,
    /// Classification of the failure.
    pub kind: StageFailureKind,
    /// Human-readable description of the failure.
    pub message: String,
}
124
/// Outcome of one acquisition run.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AcquisitionResult {
    /// `true` when some stage succeeded.
    pub success: bool,
    /// Stage that produced the successful result, if any.
    pub strategy_used: Option<StrategyUsed>,
    /// Strategies the runner recorded as attempted, in the order it started them.
    pub attempted: Vec<StrategyUsed>,
    /// URL reported by the successful stage.
    pub final_url: Option<String>,
    /// HTTP status reported by the successful stage.
    pub status_code: Option<u16>,
    /// Leading part of the HTML, limited to `html_excerpt_bytes`.
    pub html_excerpt: Option<String>,
    /// Value returned by the extraction script, when one was run.
    pub extracted: Option<Value>,
    /// Failures collected along the way.
    pub failures: Vec<StageFailure>,
    /// `true` when the wall-clock budget ran out.
    pub timed_out: bool,
}
147
impl AcquisitionResult {
    /// Returns a result with nothing attempted: not successful, not timed
    /// out, and with every optional field unset.
    const fn empty() -> Self {
        Self {
            success: false,
            strategy_used: None,
            attempted: Vec::new(),
            final_url: None,
            status_code: None,
            html_excerpt: None,
            extracted: None,
            failures: Vec::new(),
            timed_out: false,
        }
    }
}
163
/// Data captured by a stage that completed its fetch or page load.
#[derive(Debug, Clone)]
struct StageSuccess {
    /// URL reported after the fetch or navigation.
    final_url: Option<String>,
    /// HTTP status, when the stage could determine one.
    status_code: Option<u16>,
    /// Truncated HTML of the response or page.
    html_excerpt: Option<String>,
    /// Result of the extraction script; always `None` for HTTP stages.
    extracted: Option<Value>,
}
171
/// What executing a single ladder stage produced.
#[derive(Debug, Clone)]
enum StageOutcome {
    /// The stage did no work; the ladder moves on without recording success or failure.
    Marker,
    /// The stage succeeded; the ladder stops here.
    Success(StageSuccess),
    /// The stage failed; the failure is recorded and the next stage is tried.
    Failure(StageFailure),
}
178
/// Walks the strategy ladder for acquisition requests.
///
/// Cloning is cheap: clones share the same browser pool through the `Arc`.
#[derive(Clone)]
pub struct AcquisitionRunner {
    /// Pool the browser stages acquire their browsers from.
    pool: Arc<BrowserPool>,
}
184
185impl AcquisitionRunner {
    /// Creates a runner whose browser stages acquire browsers from `pool`.
    #[must_use]
    pub const fn new(pool: Arc<BrowserPool>) -> Self {
        Self { pool }
    }
203
204 #[must_use]
208 pub fn strategy_ladder(
209 mode: AcquisitionMode,
210 investigate_start: Option<StrategyUsed>,
211 ) -> Vec<StrategyUsed> {
212 let mut stages = match mode {
213 AcquisitionMode::Fast => vec![
214 StrategyUsed::DirectHttp,
215 StrategyUsed::TlsProfiledHttp,
216 StrategyUsed::BrowserLightStealth,
217 ],
218 AcquisitionMode::Resilient => vec![
219 StrategyUsed::DirectHttp,
220 StrategyUsed::TlsProfiledHttp,
221 StrategyUsed::BrowserLightStealth,
222 StrategyUsed::StickyProxyBrowserSession,
223 ],
224 AcquisitionMode::Hostile => vec![
225 StrategyUsed::BrowserLightStealth,
226 StrategyUsed::StickyProxyBrowserSession,
227 StrategyUsed::TlsProfiledHttp,
228 StrategyUsed::DirectHttp,
229 ],
230 AcquisitionMode::Investigate => {
231 let start = investigate_start.unwrap_or(StrategyUsed::BrowserLightStealth);
232 vec![
233 StrategyUsed::InvestigateEntry,
234 start,
235 StrategyUsed::StickyProxyBrowserSession,
236 StrategyUsed::TlsProfiledHttp,
237 ]
238 }
239 };
240
241 dedupe_preserve_order(&mut stages);
242 stages
243 }
244
245 pub async fn run(&self, request: AcquisitionRequest) -> AcquisitionResult {
268 let timeout = request.total_timeout;
269 let timeout_strategy = Self::strategy_ladder(request.mode, request.investigate_start)
270 .into_iter()
271 .find(|strategy| *strategy != StrategyUsed::InvestigateEntry)
272 .unwrap_or(StrategyUsed::DirectHttp);
273 let mut result = tokio::time::timeout(timeout, self.run_inner(&request))
274 .await
275 .unwrap_or_else(|_| {
276 let mut timed_out = AcquisitionResult::empty();
277 timed_out.timed_out = true;
278 timed_out.failures.push(StageFailure {
279 strategy: timeout_strategy,
280 kind: StageFailureKind::Timeout,
281 message: format!("acquisition timed out after {}ms", timeout.as_millis()),
282 });
283 timed_out
284 });
285
286 if !result.success {
287 if result.failures.is_empty() {
289 result.failures.push(StageFailure {
290 strategy: timeout_strategy,
291 kind: StageFailureKind::Transport,
292 message: "acquisition ended without stage output".to_string(),
293 });
294 }
295 }
296
297 result
298 }
299
300 async fn run_inner(&self, request: &AcquisitionRequest) -> AcquisitionResult {
301 let mut result = AcquisitionResult::empty();
302
303 #[cfg(feature = "browserbase")]
304 let mut ladder = Self::strategy_ladder(request.mode, request.investigate_start);
305
306 #[cfg(not(feature = "browserbase"))]
307 let ladder = Self::strategy_ladder(request.mode, request.investigate_start);
308
309 #[cfg(feature = "browserbase")]
310 {
311 maybe_insert_browserbase_stage(&mut ladder, request.browserbase_enabled);
312 }
313 let started = Instant::now();
314
315 for strategy in ladder {
316 if started.elapsed() >= request.total_timeout {
317 result.timed_out = true;
318 result.failures.push(StageFailure {
319 strategy,
320 kind: StageFailureKind::Timeout,
321 message: "wall-clock timeout reached before stage execution".to_string(),
322 });
323 break;
324 }
325
326 result.attempted.push(strategy);
327 match self.execute_stage(strategy, request).await {
328 StageOutcome::Marker => {}
329 StageOutcome::Success(success) => {
330 result.success = true;
331 result.strategy_used = Some(strategy);
332 result.final_url = success.final_url;
333 result.status_code = success.status_code;
334 result.html_excerpt = success.html_excerpt;
335 result.extracted = success.extracted;
336 break;
337 }
338 StageOutcome::Failure(failure) => result.failures.push(failure),
339 }
340 }
341
342 result
343 }
344
345 async fn execute_stage(
346 &self,
347 strategy: StrategyUsed,
348 request: &AcquisitionRequest,
349 ) -> StageOutcome {
350 match strategy {
351 StrategyUsed::DirectHttp => {
352 #[cfg(feature = "tls-config")]
353 {
354 self.run_http_stage(request, false).await
355 }
356
357 #[cfg(not(feature = "tls-config"))]
358 {
359 self.run_http_stage(request, false)
360 }
361 }
362 StrategyUsed::TlsProfiledHttp => {
363 #[cfg(feature = "tls-config")]
364 {
365 self.run_http_stage(request, true).await
366 }
367
368 #[cfg(not(feature = "tls-config"))]
369 {
370 self.run_http_stage(request, true)
371 }
372 }
373 StrategyUsed::BrowserLightStealth => self.run_browser_stage(request, false).await,
374 StrategyUsed::StickyProxyBrowserSession => self.run_browser_stage(request, true).await,
375 #[cfg(feature = "browserbase")]
376 StrategyUsed::BrowserbaseManagedSession => Self::run_browserbase_stage(request).await,
377 StrategyUsed::InvestigateEntry => StageOutcome::Marker,
378 }
379 }
380
    /// Loads the request URL in a remote Browserbase session.
    ///
    /// Steps: check that the request opted in, read `BROWSERBASE_API_KEY` and
    /// `BROWSERBASE_PROJECT_ID` from the environment, create a session, connect
    /// to it, drive the page (navigate, optional selector wait, optional script,
    /// HTML capture), then close the browser and delete the session.
    ///
    /// Every failure is reported as `StageOutcome::Failure` attributed to
    /// `BrowserbaseManagedSession`.
    ///
    /// NOTE(review): the cleanup at the end only runs when this future is polled
    /// to completion. If the caller drops it part-way (the `tokio::time::timeout`
    /// wrapper in `run` can do that), the handler task and the remote session
    /// are not cleaned up here — confirm whether that is handled elsewhere.
    #[cfg(feature = "browserbase")]
    #[allow(clippy::too_many_lines)]
    async fn run_browserbase_stage(request: &AcquisitionRequest) -> StageOutcome {
        if !request.browserbase_enabled {
            return StageOutcome::Failure(StageFailure {
                strategy: StrategyUsed::BrowserbaseManagedSession,
                kind: StageFailureKind::Setup,
                message: "browserbase stage disabled for this request".to_string(),
            });
        }

        // A blank value is treated the same as a missing variable.
        let api_key = match std::env::var("BROWSERBASE_API_KEY") {
            Ok(value) if !value.trim().is_empty() => value,
            _ => {
                return StageOutcome::Failure(StageFailure {
                    strategy: StrategyUsed::BrowserbaseManagedSession,
                    kind: StageFailureKind::Setup,
                    message: "browserbase requires BROWSERBASE_API_KEY".to_string(),
                });
            }
        };

        let project_id = match std::env::var("BROWSERBASE_PROJECT_ID") {
            Ok(value) if !value.trim().is_empty() => value,
            _ => {
                return StageOutcome::Failure(StageFailure {
                    strategy: StrategyUsed::BrowserbaseManagedSession,
                    kind: StageFailureKind::Setup,
                    message: "browserbase requires BROWSERBASE_PROJECT_ID".to_string(),
                });
            }
        };

        let session = match create_browserbase_session(request, &api_key, &project_id).await {
            Ok(session) => session,
            Err(err) => {
                return StageOutcome::Failure(StageFailure {
                    strategy: StrategyUsed::BrowserbaseManagedSession,
                    kind: classify_browser_error(&err),
                    message: err.to_string(),
                });
            }
        };

        // The connect attempt gets the smaller of the request and total timeouts.
        let connect_timeout = request.request_timeout.min(request.total_timeout);
        let (mut browser, mut handler) = match timeout(
            connect_timeout,
            Browser::connect(session.connect_url.clone()),
        )
        .await
        {
            Ok(Ok(pair)) => pair,
            Ok(Err(err)) => {
                // The session already exists remotely; delete it best-effort before bailing out.
                let _ = delete_browserbase_session(request, &api_key, &session.id).await;
                return StageOutcome::Failure(StageFailure {
                    strategy: StrategyUsed::BrowserbaseManagedSession,
                    kind: StageFailureKind::Transport,
                    message: format!("browserbase connect failed: {err}"),
                });
            }
            Err(_) => {
                let _ = delete_browserbase_session(request, &api_key, &session.id).await;
                return StageOutcome::Failure(StageFailure {
                    strategy: StrategyUsed::BrowserbaseManagedSession,
                    kind: StageFailureKind::Timeout,
                    message: format!(
                        "browserbase connect timed out after {}ms",
                        connect_timeout.as_millis()
                    ),
                });
            }
        };

        // Drain handler events in the background; stop at the first error.
        let handler_task = tokio::spawn(async move {
            while let Some(event) = handler.next().await {
                if let Err(error) = event {
                    tracing::warn!(%error, "browserbase handler error");
                    break;
                }
            }
        });

        // The page work is collected into a `Result` so the cleanup below runs
        // on both the success and the error path.
        let run_result =
            async {
                let raw_page = browser.new_page("about:blank").await.map_err(|err| {
                    BrowserError::CdpError {
                        operation: "Browser.newPage".to_string(),
                        message: err.to_string(),
                    }
                })?;

                let mut page = crate::page::PageHandle::new(raw_page, request.navigation_timeout);

                page.navigate(
                    &request.url,
                    WaitUntil::DomContentLoaded,
                    request.navigation_timeout,
                )
                .await?;

                if let Some(selector) = &request.wait_for_selector {
                    page.wait_for_selector(selector, request.navigation_timeout)
                        .await?;
                }

                let extracted = match request.extraction_js.as_deref() {
                    Some(script) => Some(page.eval::<Value>(script).await.map_err(|err| {
                        BrowserError::ScriptExecutionFailed {
                            script: script.to_string(),
                            reason: err.to_string(),
                        }
                    })?),
                    None => None,
                };

                let html = page.content().await?;
                // A missing URL or status does not fail the stage.
                let final_url = page.url().await.ok();
                let status_code = page.status_code().ok().flatten();

                Ok::<StageSuccess, BrowserError>(StageSuccess {
                    final_url,
                    status_code,
                    html_excerpt: Some(truncate_html(&html, request.html_excerpt_bytes)),
                    extracted,
                })
            }
            .await;

        // Best-effort cleanup: bounded browser close, stop the handler task,
        // then delete the remote session. Errors here are ignored.
        let _ = timeout(Duration::from_secs(5), browser.close()).await;
        handler_task.abort();
        let _ = delete_browserbase_session(request, &api_key, &session.id).await;

        match run_result {
            Ok(success) => {
                // A page that loaded with a blocking status counts as a failure.
                if is_block_status(success.status_code) {
                    StageOutcome::Failure(StageFailure {
                        strategy: StrategyUsed::BrowserbaseManagedSession,
                        kind: StageFailureKind::Blocked,
                        message: format!(
                            "blocked status during browserbase stage: {:?}",
                            success.status_code
                        ),
                    })
                } else {
                    StageOutcome::Success(success)
                }
            }
            Err(err) => StageOutcome::Failure(StageFailure {
                strategy: StrategyUsed::BrowserbaseManagedSession,
                kind: classify_browser_error(&err),
                message: err.to_string(),
            }),
        }
    }
535
536 async fn run_browser_stage(&self, request: &AcquisitionRequest, sticky: bool) -> StageOutcome {
537 let strategy = if sticky {
538 StrategyUsed::StickyProxyBrowserSession
539 } else {
540 StrategyUsed::BrowserLightStealth
541 };
542
543 let handle_result = if sticky {
544 let context = host_hint(&request.url).unwrap_or_else(|| "default".to_string());
545 self.pool.acquire_for(&context).await
546 } else {
547 self.pool.acquire().await
548 };
549
550 let handle = match handle_result {
551 Ok(handle) => handle,
552 Err(err) => {
553 return StageOutcome::Failure(StageFailure {
554 strategy,
555 kind: StageFailureKind::Setup,
556 message: format!("browser acquire failed: {err}"),
557 });
558 }
559 };
560
561 let page_result = async {
562 let browser = handle.browser().ok_or_else(|| {
563 BrowserError::ConfigError("browser handle already released".to_string())
564 })?;
565 let mut page = browser.new_page().await?;
566 page.navigate(
567 &request.url,
568 WaitUntil::DomContentLoaded,
569 request.navigation_timeout,
570 )
571 .await?;
572
573 if let Some(selector) = &request.wait_for_selector {
574 page.wait_for_selector(selector, request.navigation_timeout)
575 .await?;
576 }
577
578 let extracted = match request.extraction_js.as_deref() {
579 Some(script) => Some(page.eval::<Value>(script).await.map_err(|err| {
580 BrowserError::ScriptExecutionFailed {
581 script: script.to_string(),
582 reason: err.to_string(),
583 }
584 })?),
585 None => None,
586 };
587
588 let html = page.content().await?;
589 let final_url = page.url().await.ok();
590 let status_code = page.status_code().ok().flatten();
591 let html_excerpt = truncate_html(&html, request.html_excerpt_bytes);
592
593 drop(page);
594
595 Ok::<StageSuccess, BrowserError>(StageSuccess {
596 final_url,
597 status_code,
598 html_excerpt: Some(html_excerpt),
599 extracted,
600 })
601 }
602 .await;
603
604 handle.release().await;
605
606 match page_result {
607 Ok(success) => {
608 if is_block_status(success.status_code) {
609 StageOutcome::Failure(StageFailure {
610 strategy,
611 kind: StageFailureKind::Blocked,
612 message: format!(
613 "blocked status during browser stage: {:?}",
614 success.status_code
615 ),
616 })
617 } else {
618 StageOutcome::Success(success)
619 }
620 }
621 Err(err) => StageOutcome::Failure(StageFailure {
622 strategy,
623 kind: classify_browser_error(&err),
624 message: err.to_string(),
625 }),
626 }
627 }
628
629 #[cfg(feature = "tls-config")]
630 async fn run_http_stage(
631 &self,
632 request: &AcquisitionRequest,
633 tls_profiled: bool,
634 ) -> StageOutcome {
635 if request.wait_for_selector.is_some() || request.extraction_js.is_some() {
636 return StageOutcome::Failure(StageFailure {
637 strategy: if tls_profiled {
638 StrategyUsed::TlsProfiledHttp
639 } else {
640 StrategyUsed::DirectHttp
641 },
642 kind: StageFailureKind::Extraction,
643 message: "HTTP stages cannot satisfy selector/extraction requirements".to_string(),
644 });
645 }
646
647 self.run_http_stage_impl(request, tls_profiled).await
648 }
649
650 #[cfg(not(feature = "tls-config"))]
651 fn run_http_stage(&self, request: &AcquisitionRequest, tls_profiled: bool) -> StageOutcome {
652 if request.wait_for_selector.is_some() || request.extraction_js.is_some() {
653 return StageOutcome::Failure(StageFailure {
654 strategy: if tls_profiled {
655 StrategyUsed::TlsProfiledHttp
656 } else {
657 StrategyUsed::DirectHttp
658 },
659 kind: StageFailureKind::Extraction,
660 message: "HTTP stages cannot satisfy selector/extraction requirements".to_string(),
661 });
662 }
663
664 self.run_http_stage_impl(request, tls_profiled)
665 }
666
667 #[cfg(feature = "tls-config")]
668 async fn run_http_stage_impl(
669 &self,
670 request: &AcquisitionRequest,
671 tls_profiled: bool,
672 ) -> StageOutcome {
673 use crate::tls::{CHROME_131, build_profiled_client_preset};
674
675 let strategy = if tls_profiled {
676 StrategyUsed::TlsProfiledHttp
677 } else {
678 StrategyUsed::DirectHttp
679 };
680
681 let client = if tls_profiled {
682 match build_profiled_client_preset(&CHROME_131, None) {
683 Ok(client) => client,
684 Err(err) => {
685 return StageOutcome::Failure(StageFailure {
686 strategy,
687 kind: StageFailureKind::Setup,
688 message: format!("tls-profiled client setup failed: {err}"),
689 });
690 }
691 }
692 } else {
693 match reqwest::Client::builder()
694 .timeout(request.request_timeout)
695 .cookie_store(true)
696 .build()
697 {
698 Ok(client) => client,
699 Err(err) => {
700 return StageOutcome::Failure(StageFailure {
701 strategy,
702 kind: StageFailureKind::Setup,
703 message: format!("http client setup failed: {err}"),
704 });
705 }
706 }
707 };
708
709 let response = match client
710 .get(&request.url)
711 .timeout(request.request_timeout)
712 .send()
713 .await
714 {
715 Ok(response) => response,
716 Err(err) => {
717 return StageOutcome::Failure(StageFailure {
718 strategy,
719 kind: if err.is_timeout() {
720 StageFailureKind::Timeout
721 } else {
722 StageFailureKind::Transport
723 },
724 message: err.to_string(),
725 });
726 }
727 };
728
729 let status_code = Some(response.status().as_u16());
730 let final_url = Some(response.url().to_string());
731 let html = match response.text().await {
732 Ok(text) => text,
733 Err(err) => {
734 return StageOutcome::Failure(StageFailure {
735 strategy,
736 kind: StageFailureKind::Transport,
737 message: format!("response body read failed: {err}"),
738 });
739 }
740 };
741
742 if is_block_status(status_code) {
743 return StageOutcome::Failure(StageFailure {
744 strategy,
745 kind: StageFailureKind::Blocked,
746 message: format!("blocked status from HTTP stage: {status_code:?}"),
747 });
748 }
749
750 StageOutcome::Success(StageSuccess {
751 final_url,
752 status_code,
753 html_excerpt: Some(truncate_html(&html, request.html_excerpt_bytes)),
754 extracted: None,
755 })
756 }
757
758 #[cfg(not(feature = "tls-config"))]
759 fn run_http_stage_impl(
760 &self,
761 _request: &AcquisitionRequest,
762 tls_profiled: bool,
763 ) -> StageOutcome {
764 let strategy = if tls_profiled {
765 StrategyUsed::TlsProfiledHttp
766 } else {
767 StrategyUsed::DirectHttp
768 };
769 StageOutcome::Failure(StageFailure {
770 strategy,
771 kind: StageFailureKind::Setup,
772 message: "HTTP acquisition requires the `tls-config` feature".to_string(),
773 })
774 }
775}
776
/// Identifies a session created through the Browserbase API.
#[cfg(feature = "browserbase")]
#[derive(Debug, Clone)]
struct BrowserbaseSession {
    /// Session id, used to delete the session afterwards.
    id: String,
    /// URL passed to `Browser::connect` to attach to the session.
    connect_url: String,
}
783
/// Creates a Browserbase session for `project_id` and returns its id and connect URL.
///
/// The request is a `POST {api base}/sessions` authenticated with `api_key`
/// and limited by `request.request_timeout`.
///
/// # Errors
///
/// - `ConfigError` when the HTTP client cannot be built, or when the response
///   lacks a connect URL or a session id.
/// - `ConnectionError` when the request fails, the status is not a success, or
///   the response body is not valid JSON.
///
/// If the response carries a session id but no connect URL, the session is
/// deleted best-effort before the error is returned, so an unusable session
/// is not left running.
#[cfg(feature = "browserbase")]
async fn create_browserbase_session(
    request: &AcquisitionRequest,
    api_key: &str,
    project_id: &str,
) -> Result<BrowserbaseSession, BrowserError> {
    let client = reqwest::Client::builder()
        .timeout(request.request_timeout)
        .build()
        .map_err(|err| {
            BrowserError::ConfigError(format!("browserbase client setup failed: {err}"))
        })?;

    let create_url = format!("{}/sessions", browserbase_api_base());
    let response = client
        .post(create_url.as_str())
        .bearer_auth(api_key)
        .header("x-bb-api-key", api_key)
        .json(&serde_json::json!({ "projectId": project_id }))
        .send()
        .await
        .map_err(|err| BrowserError::ConnectionError {
            url: create_url.clone(),
            reason: err.to_string(),
        })?;

    let status = response.status();
    if !status.is_success() {
        let body = response.text().await.unwrap_or_default();
        return Err(BrowserError::ConnectionError {
            url: create_url,
            reason: format!("session create failed ({status}): {body}"),
        });
    }

    // Report the URL that was actually requested, like the other error paths.
    let payload: Value = response
        .json()
        .await
        .map_err(|err| BrowserError::ConnectionError {
            url: create_url.clone(),
            reason: format!("session create response parse failed: {err}"),
        })?;

    match (
        browserbase_session_id(&payload),
        browserbase_connect_url(&payload),
    ) {
        (Some(id), Some(connect_url)) => Ok(BrowserbaseSession { id, connect_url }),
        (orphan_id, None) => {
            // The session exists remotely but cannot be used; release it
            // best-effort so it does not linger.
            if let Some(id) = orphan_id {
                let _ = delete_browserbase_session(request, api_key, &id).await;
            }
            Err(BrowserError::ConfigError(
                "browserbase response missing connect URL".to_string(),
            ))
        }
        (None, Some(_)) => Err(BrowserError::ConfigError(
            "browserbase response missing session id".to_string(),
        )),
    }
}
839
/// Deletes the Browserbase session `session_id`.
///
/// The request is a `DELETE {api base}/sessions/{session_id}` authenticated
/// with `api_key` and limited by `request.request_timeout`.
///
/// # Errors
///
/// - `ConfigError` when the HTTP client cannot be built.
/// - `ConnectionError` when the request fails or the status is not a success.
#[cfg(feature = "browserbase")]
async fn delete_browserbase_session(
    request: &AcquisitionRequest,
    api_key: &str,
    session_id: &str,
) -> Result<(), BrowserError> {
    let http = reqwest::Client::builder()
        .timeout(request.request_timeout)
        .build()
        .map_err(|err| {
            BrowserError::ConfigError(format!("browserbase client setup failed: {err}"))
        })?;

    let delete_url = format!("{}/sessions/{session_id}", browserbase_api_base());
    let sent = http
        .delete(delete_url.as_str())
        .bearer_auth(api_key)
        .header("x-bb-api-key", api_key)
        .send()
        .await;

    let response = match sent {
        Ok(response) => response,
        Err(err) => {
            return Err(BrowserError::ConnectionError {
                url: delete_url,
                reason: err.to_string(),
            });
        }
    };

    if response.status().is_success() {
        return Ok(());
    }

    Err(BrowserError::ConnectionError {
        url: delete_url,
        reason: format!("session delete failed with status {}", response.status()),
    })
}
874
875#[cfg(feature = "browserbase")]
/// Returns the Browserbase API base URL without a trailing slash.
///
/// Reads `BROWSERBASE_API_BASE`. A missing, blank, or slash-only value falls
/// back to the public endpoint, matching how blank API keys and project ids
/// are treated as missing; otherwise a blank override would produce relative
/// request URLs such as `/sessions`.
fn browserbase_api_base() -> String {
    const DEFAULT_BASE: &str = "https://api.browserbase.com/v1";

    let configured = std::env::var("BROWSERBASE_API_BASE").ok();
    configured
        .as_deref()
        .map(|value| value.trim().trim_end_matches('/'))
        .filter(|value| !value.is_empty())
        .unwrap_or(DEFAULT_BASE)
        .to_string()
}
882
/// Extracts the session id from a Browserbase session-create response.
///
/// Looks for `id`, `sessionId`, then `session_id` at the top level, and then
/// for the same keys under `data`. A key only counts when its value is a
/// string, so a non-string `id` no longer hides a valid fallback key; this
/// matches how the connect URL is looked up.
#[cfg(feature = "browserbase")]
fn browserbase_session_id(payload: &Value) -> Option<String> {
    const ID_KEYS: [&str; 3] = ["id", "sessionId", "session_id"];

    /// Returns the first string value among `ID_KEYS` in `object`.
    fn first_id(object: &Value) -> Option<&str> {
        ID_KEYS
            .iter()
            .find_map(|key| object.get(*key).and_then(Value::as_str))
    }

    first_id(payload)
        .or_else(|| payload.get("data").and_then(first_id))
        .map(ToString::to_string)
}
895
/// Extracts the connect URL from a Browserbase session-create response.
///
/// Tries a fixed list of key spellings at the top level first and then under
/// `data`; the first key whose value is a string wins.
#[cfg(feature = "browserbase")]
fn browserbase_connect_url(payload: &Value) -> Option<String> {
    const URL_KEYS: [&str; 9] = [
        "connectUrl",
        "connect_url",
        "wsUrl",
        "ws_url",
        "websocketUrl",
        "websocket_url",
        "browserWSEndpoint",
        "wsEndpoint",
        "ws_endpoint",
    ];

    /// Returns the first string value among `URL_KEYS` in `object`.
    fn first_url(object: &Value) -> Option<&str> {
        URL_KEYS
            .iter()
            .find_map(|key| object.get(*key).and_then(Value::as_str))
    }

    let found = match first_url(payload) {
        Some(url) => Some(url),
        None => payload.get("data").and_then(first_url),
    };
    found.map(ToString::to_string)
}
930
931fn dedupe_preserve_order(stages: &mut Vec<StrategyUsed>) {
932 let mut seen = Vec::new();
933 stages.retain(|stage| {
934 if seen.contains(stage) {
935 false
936 } else {
937 seen.push(*stage);
938 true
939 }
940 });
941}
942
/// Adds the Browserbase stage to `stages` when `enabled` and not already present.
///
/// The stage goes directly before `StickyProxyBrowserSession` when the ladder
/// has one, and at the end otherwise.
#[cfg(feature = "browserbase")]
fn maybe_insert_browserbase_stage(stages: &mut Vec<StrategyUsed>, enabled: bool) {
    let already_present = stages.contains(&StrategyUsed::BrowserbaseManagedSession);
    if enabled && !already_present {
        // Inserting at `len()` is the same as appending.
        let insert_at = stages
            .iter()
            .position(|stage| *stage == StrategyUsed::StickyProxyBrowserSession)
            .unwrap_or(stages.len());
        stages.insert(insert_at, StrategyUsed::BrowserbaseManagedSession);
    }
}
958
959fn classify_browser_error(error: &BrowserError) -> StageFailureKind {
960 match error {
961 BrowserError::Timeout { .. } => StageFailureKind::Timeout,
962 BrowserError::NavigationFailed { reason, .. } if reason.contains("selector") => {
963 StageFailureKind::Blocked
964 }
965 BrowserError::ScriptExecutionFailed { .. } => StageFailureKind::Extraction,
966 BrowserError::ConfigError(_) | BrowserError::PoolExhausted { .. } => {
967 StageFailureKind::Setup
968 }
969 BrowserError::ProxyUnavailable { .. }
970 | BrowserError::ConnectionError { .. }
971 | BrowserError::CdpError { .. }
972 | BrowserError::LaunchFailed { .. }
973 | BrowserError::NavigationFailed { .. }
974 | BrowserError::Io(_)
975 | BrowserError::StaleNode { .. } => StageFailureKind::Transport,
976 #[cfg(feature = "extract")]
977 BrowserError::ExtractionFailed(_) => StageFailureKind::Extraction,
978 }
979}
980
/// Returns `true` when `status` is one of the statuses treated as a block:
/// 401, 403, 407, 429, or 503. A missing status is not a block.
const fn is_block_status(status: Option<u16>) -> bool {
    match status {
        Some(code) => matches!(code, 401 | 403 | 407 | 429 | 503),
        None => false,
    }
}
984
/// Returns the longest prefix of `html` that is at most `max_bytes` long and
/// ends on a character boundary.
fn truncate_html(html: &str, max_bytes: usize) -> String {
    if html.len() <= max_bytes {
        return html.to_string();
    }

    // Back off from the byte limit until the cut no longer splits a character.
    // Index 0 is always a boundary, so the loop terminates.
    let mut cut = max_bytes;
    while !html.is_char_boundary(cut) {
        cut -= 1;
    }
    html[..cut].to_string()
}
999
/// Extracts the lowercased host from an absolute URL, without parsing it fully.
///
/// The authority is everything between `://` and the first `/`, `?`, or `#`.
/// User info (`user:pass@`) and the port are stripped. A bracketed IPv6
/// literal is returned with its brackets, for example `[::1]`.
///
/// Returns `None` when the URL has no `://`, the host is empty, or an IPv6
/// literal is empty or has no closing bracket.
fn host_hint(url: &str) -> Option<String> {
    let (_, rest) = url.split_once("://")?;

    // The authority ends at the path, query, or fragment, whichever comes first.
    let authority_end = rest
        .find(|ch: char| matches!(ch, '/' | '?' | '#'))
        .unwrap_or(rest.len());
    let authority = &rest[..authority_end];

    // Drop `user:pass@` when present.
    let host_port = authority
        .rsplit_once('@')
        .map_or(authority, |(_, after_userinfo)| after_userinfo);

    let host = if let Some(inside) = host_port.strip_prefix('[') {
        // IPv6 literal: the colons inside the brackets are not port separators.
        let close = inside.find(']')?;
        if close == 0 {
            return None;
        }
        // `close` is relative to `inside`; add the opening and closing bracket.
        &host_port[..close + 2]
    } else {
        host_port.split(':').next()?
    };

    if host.is_empty() {
        None
    } else {
        Some(host.to_ascii_lowercase())
    }
}
1010
// Unit tests for ladder construction and the pure helper functions.
#[cfg(test)]
mod tests {
    use super::*;

    // The ladder for a mode is fixed, and a caller-chosen Investigate start
    // that repeats a later rung is deduplicated.
    #[test]
    fn ladder_is_deterministic_for_modes() {
        assert_eq!(
            AcquisitionRunner::strategy_ladder(AcquisitionMode::Fast, None),
            vec![
                StrategyUsed::DirectHttp,
                StrategyUsed::TlsProfiledHttp,
                StrategyUsed::BrowserLightStealth,
            ]
        );

        assert_eq!(
            AcquisitionRunner::strategy_ladder(
                AcquisitionMode::Investigate,
                Some(StrategyUsed::StickyProxyBrowserSession)
            ),
            vec![
                StrategyUsed::InvestigateEntry,
                StrategyUsed::StickyProxyBrowserSession,
                StrategyUsed::TlsProfiledHttp,
            ]
        );
    }

    // Blocking statuses are recognised; success and missing statuses are not.
    #[test]
    fn block_statuses_are_classified() {
        assert!(is_block_status(Some(403)));
        assert!(is_block_status(Some(429)));
        assert!(!is_block_status(Some(200)));
        assert!(!is_block_status(None));
    }

    // User info and port are stripped from the authority.
    #[test]
    fn host_hint_extracts_authority() {
        assert_eq!(
            host_hint("https://user:pass@example.com:8443/path"),
            Some("example.com".to_string())
        );
    }

    // A limit that falls inside a multi-byte character cuts before that character.
    #[test]
    fn truncate_html_respects_utf8_boundaries() {
        let src = "abc😀def";
        let out = truncate_html(src, 5);
        assert_eq!(out, "abc");
    }

    // The connect URL is found under `data` when it is absent at the top level.
    #[cfg(feature = "browserbase")]
    #[test]
    fn browserbase_connect_url_is_extracted_from_nested_data() {
        let payload = serde_json::json!({
            "data": {
                "connectUrl": "wss://connect.browserbase.example/devtools/browser/abc"
            }
        });

        assert_eq!(
            browserbase_connect_url(&payload),
            Some("wss://connect.browserbase.example/devtools/browser/abc".to_string())
        );
    }

    // The Browserbase stage is placed directly before the sticky-proxy stage.
    #[cfg(feature = "browserbase")]
    #[test]
    fn browserbase_stage_is_inserted_before_sticky_stage() {
        let mut ladder = vec![
            StrategyUsed::DirectHttp,
            StrategyUsed::StickyProxyBrowserSession,
            StrategyUsed::TlsProfiledHttp,
        ];

        maybe_insert_browserbase_stage(&mut ladder, true);

        assert_eq!(
            ladder,
            vec![
                StrategyUsed::DirectHttp,
                StrategyUsed::BrowserbaseManagedSession,
                StrategyUsed::StickyProxyBrowserSession,
                StrategyUsed::TlsProfiledHttp,
            ]
        );
    }
}