stygian_browser/page.rs
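//! Page-level browser automation: navigation, resource blocking and DOM queries.
//!
//! ## Resource blocking
//!
//! A [`ResourceFilter`] names the resource types (images, fonts, stylesheets,
//! media) whose requests are failed instead of fetched. Install one with
//! [`PageHandle::set_resource_filter`].
//!
//! ## Wait strategies
//!
//! [`PageHandle`] exposes three wait strategies via [`WaitUntil`]:
//! - `DomContentLoaded` — fires when the HTML is parsed
//! - `NetworkIdle` — waits for the load event, then for in-flight requests to settle
//! - `Selector(css)` — polls until an element matching the CSS selector exists
//!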
9//! # Example
10//!
11//! ```no_run
12//! use stygian_browser::{BrowserPool, BrowserConfig};
13//! use stygian_browser::page::{ResourceFilter, WaitUntil};
14//! use std::time::Duration;
15//!
16//! # async fn run() -> stygian_browser::error::Result<()> {
17//! let pool = BrowserPool::new(BrowserConfig::default()).await?;
18//! let handle = pool.acquire().await?;
19//!
20//! let mut page = handle.browser().expect("valid browser").new_page().await?;
21//! page.set_resource_filter(ResourceFilter::block_media()).await?;
22//! page.navigate("https://example.com", WaitUntil::DomContentLoaded, Duration::from_secs(30)).await?;
23//! let title = page.title().await?;
24//! println!("title: {title}");
25//! handle.release().await;
26//! # Ok(())
27//! # }
28//! ```
29
30use std::collections::HashMap;
31use std::sync::{
32 Arc,
33 atomic::{AtomicU16, Ordering},
34};
35use std::time::Duration;
36
37use chromiumoxide::Page;
38use serde::{Deserialize, Serialize};
39use tokio::time::timeout;
40use tracing::{debug, warn};
41
42use crate::error::{BrowserError, Result};
43
44// ─── ResourceType ─────────────────────────────────────────────────────────────
45
/// Resource categories, named as in the Chrome DevTools Protocol, that can be
/// intercepted and blocked.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ResourceType {
    /// Images: `<img>`, `<picture>`, background images.
    Image,
    /// Web fonts pulled in through CSS `@font-face`.
    Font,
    /// External CSS stylesheets.
    Stylesheet,
    /// Audio and video files.
    Media,
}

impl ResourceType {
    /// Return the CDP spelling of this resource type, e.g. `"Image"`.
    pub const fn as_cdp_str(&self) -> &'static str {
        match *self {
            Self::Media => "Media",
            Self::Stylesheet => "Stylesheet",
            Self::Font => "Font",
            Self::Image => "Image",
        }
    }
}
69
70// ─── ResourceFilter ───────────────────────────────────────────────────────────
71
72///
73/// # Example
74///
75/// ```
76/// use stygian_browser::page::ResourceFilter;
77/// let filter = ResourceFilter::block_media();
78/// assert!(filter.should_block("Image"));
79/// ```
80#[derive(Debug, Clone, Default)]
81pub struct ResourceFilter {
82 blocked: Vec<ResourceType>,
83}
84
85impl ResourceFilter {
86 /// Block all media resources (images, fonts, CSS, audio/video).
87 pub fn block_media() -> Self {
88 Self {
89 blocked: vec![
90 ResourceType::Image,
91 ResourceType::Font,
92 ResourceType::Stylesheet,
93 ResourceType::Media,
94 ],
95 }
96 }
97
98 pub fn block_images_and_fonts() -> Self {
99 Self {
100 blocked: vec![ResourceType::Image, ResourceType::Font],
101 }
102 }
103
104 #[must_use]
105 pub fn block(mut self, resource: ResourceType) -> Self {
106 if !self.blocked.contains(&resource) {
107 self.blocked.push(resource);
108 }
109 self
110 }
111
112 pub fn should_block(&self, cdp_type: &str) -> bool {
113 self.blocked
114 .iter()
115 .any(|r| r.as_cdp_str().eq_ignore_ascii_case(cdp_type))
116 }
117
118 pub const fn is_empty(&self) -> bool {
119 self.blocked.is_empty()
120 }
121}
122
123// ─── WaitUntil ────────────────────────────────────────────────────────────────
124
/// Specifies what condition to wait for after a page navigation.
///
/// # Example
///
/// ```
/// use stygian_browser::page::WaitUntil;
///
/// let strategy = WaitUntil::Selector("#main".to_string());
/// assert!(matches!(strategy, WaitUntil::Selector(_)));
/// ```
#[derive(Debug, Clone)]
pub enum WaitUntil {
    /// Fires when the initial HTML is fully parsed, without waiting for
    /// subresources such as images and stylesheets to finish loading.
    DomContentLoaded,
    /// Waits for the page `load` event and then for the number of in-flight
    /// network requests to stay low for a short settle period.
    NetworkIdle,
    /// Waits until an element matching the given CSS selector can be found
    /// on the page.
    Selector(String),
}
140
141// ─── NodeHandle ───────────────────────────────────────────────────────────────
142
/// A handle to a single live DOM element.
///
/// Accessors operate on the element inside the browser through one or
/// more CDP `Runtime.callFunctionOn` calls against the held V8 remote object
/// reference — no HTML serialisation occurs.
///
/// A handle becomes **stale** after page navigation or if the underlying DOM
/// node is removed. Stale calls return [`BrowserError::StaleNode`] so callers
/// can distinguish them from other CDP failures.
///
/// # Example
///
/// ```no_run
/// use stygian_browser::{BrowserPool, BrowserConfig, WaitUntil};
/// use std::time::Duration;
///
/// # async fn run() -> stygian_browser::error::Result<()> {
/// let pool = BrowserPool::new(BrowserConfig::default()).await?;
/// let handle = pool.acquire().await?;
/// let mut page = handle.browser().expect("valid browser").new_page().await?;
/// page.navigate("https://example.com", WaitUntil::DomContentLoaded, Duration::from_secs(30)).await?;
/// let nodes = page.query_selector_all("a").await?;
/// for node in &nodes {
///     let href = node.attr("href").await?;
///     let text = node.text_content().await?;
///     println!("{text}: {href:?}");
/// }
/// # Ok(())
/// # }
/// ```
pub struct NodeHandle {
    /// The underlying chromiumoxide element all accessors are issued against.
    element: chromiumoxide::element::Element,
    /// Selector that produced this handle; reported in
    /// [`BrowserError::StaleNode`] errors.
    ///
    /// Shared via `Arc<str>` so all handles from a single query reuse the
    /// same allocation rather than cloning a `String` per node.
    selector: Arc<str>,
    /// Upper bound applied to the CDP calls made through this handle.
    cdp_timeout: Duration,
    /// Page the element belongs to; used to resolve tagged neighbour elements
    /// during DOM traversal (parent / sibling navigation).
    page: chromiumoxide::Page,
}
180
181impl NodeHandle {
182 /// Return a single attribute value, or `None` if the attribute is absent.
183 ///
184 /// Issues one `Runtime.callFunctionOn` CDP call (`el.getAttribute(name)`).
185 ///
186 /// # Errors
187 ///
188 /// invalidated, or [`BrowserError::Timeout`] / [`BrowserError::CdpError`]
189 /// on transport-level failures.
190 pub async fn attr(&self, name: &str) -> Result<Option<String>> {
191 timeout(self.cdp_timeout, self.element.attribute(name))
192 .await
193 .map_err(|_| BrowserError::Timeout {
194 operation: "NodeHandle::attr".to_string(),
195 duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
196 })?
197 .map_err(|e| self.cdp_err_or_stale(&e, "attr"))
198 }
199
200 /// Return all attributes as a `HashMap<name, value>` in a **single**
201 /// CDP round-trip.
202 ///
203 /// Uses `DOM.getAttributes` (via the chromiumoxide `attributes()` API)
204 /// which returns a flat `[name, value, name, value, …]` list from the node
205 /// description — no per-attribute calls are needed.
206 ///
207 /// # Errors
208 ///
209 /// invalidated.
210 pub async fn attr_map(&self) -> Result<HashMap<String, String>> {
211 let flat = timeout(self.cdp_timeout, self.element.attributes())
212 .await
213 .map_err(|_| BrowserError::Timeout {
214 operation: "NodeHandle::attr_map".to_string(),
215 duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
216 })?
217 .map_err(|e| self.cdp_err_or_stale(&e, "attr_map"))?;
218
219 let mut map = HashMap::with_capacity(flat.len() / 2);
220 for pair in flat.chunks_exact(2) {
221 if let [name, value] = pair {
222 map.insert(name.clone(), value.clone());
223 }
224 }
225 Ok(map)
226 }
227
228 /// Return the element's `textContent` (all text inside, no markup).
229 ///
230 /// Reads the DOM `textContent` property via a single JS eval — this is the
231 /// raw text concatenation of all descendant text nodes, independent of
232 /// layout or visibility (unlike `innerText`).
233 ///
234 ///
235 /// # Errors
236 ///
237 /// invalidated.
238 pub async fn text_content(&self) -> Result<String> {
239 let returns = timeout(
240 self.cdp_timeout,
241 self.element
242 .call_js_fn(r"function() { return this.textContent ?? ''; }", true),
243 )
244 .await
245 .map_err(|_| BrowserError::Timeout {
246 operation: "NodeHandle::text_content".to_string(),
247 duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
248 })?
249 .map_err(|e| self.cdp_err_or_stale(&e, "text_content"))?;
250
251 Ok(returns
252 .result
253 .value
254 .as_ref()
255 .and_then(|v| v.as_str())
256 .unwrap_or("")
257 .to_string())
258 }
259
260 /// Return the element's `innerHTML`.
261 ///
262 ///
263 /// # Errors
264 ///
265 /// invalidated.
266 pub async fn inner_html(&self) -> Result<String> {
267 timeout(self.cdp_timeout, self.element.inner_html())
268 .await
269 .map_err(|_| BrowserError::Timeout {
270 operation: "NodeHandle::inner_html".to_string(),
271 duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
272 })?
273 .map_err(|e| self.cdp_err_or_stale(&e, "inner_html"))
274 .map(Option::unwrap_or_default)
275 }
276
277 /// Return the element's `outerHTML`.
278 ///
279 ///
280 /// # Errors
281 ///
282 /// invalidated.
283 pub async fn outer_html(&self) -> Result<String> {
284 timeout(self.cdp_timeout, self.element.outer_html())
285 .await
286 .map_err(|_| BrowserError::Timeout {
287 operation: "NodeHandle::outer_html".to_string(),
288 duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
289 })?
290 .map_err(|e| self.cdp_err_or_stale(&e, "outer_html"))
291 .map(Option::unwrap_or_default)
292 }
293
294 ///
295 /// Executes a single `Runtime.callFunctionOn` JavaScript function that
296 /// walks `parentElement` and collects tag names — no repeated CDP calls.
297 ///
298 /// ```text
299 /// ["p", "article", "body", "html"]
300 /// ```
301 ///
302 /// # Errors
303 ///
304 /// invalidated, or [`BrowserError::ScriptExecutionFailed`] when CDP
305 pub async fn ancestors(&self) -> Result<Vec<String>> {
306 let returns = timeout(
307 self.cdp_timeout,
308 self.element.call_js_fn(
309 r"function() {
310 const a = [];
311 let n = this.parentElement;
312 while (n) { a.push(n.tagName.toLowerCase()); n = n.parentElement; }
313 return a;
314 }",
315 true,
316 ),
317 )
318 .await
319 .map_err(|_| BrowserError::Timeout {
320 operation: "NodeHandle::ancestors".to_string(),
321 duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
322 })?
323 .map_err(|e| self.cdp_err_or_stale(&e, "ancestors"))?;
324
325 // With returnByValue=true and an array return, CDP delivers the value
326 // as a JSON array directly — no JSON.stringify/re-parse needed.
327 // A missing or wrong-type value indicates an unexpected CDP failure.
328 let arr = returns
329 .result
330 .value
331 .as_ref()
332 .and_then(|v| v.as_array())
333 .ok_or_else(|| BrowserError::ScriptExecutionFailed {
334 script: "NodeHandle::ancestors".to_string(),
335 reason: "CDP returned no value or a non-array value for ancestors()".to_string(),
336 })?;
337
338 arr.iter()
339 .map(|v| {
340 v.as_str().map(ToString::to_string).ok_or_else(|| {
341 BrowserError::ScriptExecutionFailed {
342 script: "NodeHandle::ancestors".to_string(),
343 reason: format!("ancestor entry is not a string: {v}"),
344 }
345 })
346 })
347 .collect()
348 }
349
350 ///
351 ///
352 ///
353 /// # Errors
354 ///
355 /// invalidated, or [`BrowserError::CdpError`] on transport failure.
356 pub async fn children_matching(&self, selector: &str) -> Result<Vec<Self>> {
357 let elements = timeout(self.cdp_timeout, self.element.find_elements(selector))
358 .await
359 .map_err(|_| BrowserError::Timeout {
360 operation: "NodeHandle::children_matching".to_string(),
361 duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
362 })?
363 .map_err(|e| self.cdp_err_or_stale(&e, "children_matching"))?;
364
365 let selector_arc: Arc<str> = Arc::from(selector);
366 Ok(elements
367 .into_iter()
368 .map(|el| Self {
369 element: el,
370 selector: selector_arc.clone(),
371 cdp_timeout: self.cdp_timeout,
372 page: self.page.clone(),
373 })
374 .collect())
375 }
376
377 /// Return the immediate parent element, or `None` if this element has no
378 /// parent (i.e. it is the document root).
379 ///
380 /// Issues a single `Runtime.callFunctionOn` CDP call that temporarily tags
381 /// the parent element with a unique attribute, then resolves it via a
382 /// CSS attribute selector.
383 ///
384 /// # Errors
385 ///
386 /// Returns an error if the CDP call fails or the page handle is invalidated.
387 ///
388 /// # Example
389 ///
390 /// ```no_run
391 /// use stygian_browser::{BrowserPool, BrowserConfig, WaitUntil};
392 /// use std::time::Duration;
393 ///
394 /// # async fn run() -> stygian_browser::error::Result<()> {
395 /// let pool = BrowserPool::new(BrowserConfig::default()).await?;
396 /// let handle = pool.acquire().await?;
397 /// let mut page = handle.browser().expect("valid browser").new_page().await?;
398 /// page.navigate("https://example.com", WaitUntil::DomContentLoaded, Duration::from_secs(30)).await?;
399 /// # let nodes = page.query_selector_all("a").await?;
400 /// if let Some(parent) = nodes[0].parent().await? {
401 /// let html = parent.outer_html().await?;
402 /// println!("parent: {}", &html[..html.len().min(80)]);
403 /// }
404 /// # Ok(())
405 /// # }
406 /// ```
407 pub async fn parent(&self) -> Result<Option<Self>> {
408 let attr = format!(
409 "data-stygian-t-{}",
410 ulid::Ulid::new().to_string().to_lowercase()
411 );
412 let js = format!(
413 "function() {{ \
414 var t = this.parentElement; \
415 if (!t) {{ return false; }} \
416 t.setAttribute('{attr}', '1'); \
417 return true; \
418 }}"
419 );
420 self.call_traversal(&js, &attr, "parent").await
421 }
422
423 /// Return the next element sibling, or `None` if this element is the last
424 /// child of its parent.
425 ///
426 /// Uses `nextElementSibling` (skips text/comment nodes).
427 ///
428 /// # Errors
429 ///
430 /// invalidated.
431 ///
432 /// # Example
433 ///
434 /// ```no_run
435 /// use stygian_browser::{BrowserPool, BrowserConfig, WaitUntil};
436 /// use std::time::Duration;
437 ///
438 /// # async fn run() -> stygian_browser::error::Result<()> {
439 /// let pool = BrowserPool::new(BrowserConfig::default()).await?;
440 /// let handle = pool.acquire().await?;
441 /// let mut page = handle.browser().expect("valid browser").new_page().await?;
442 /// page.navigate("https://example.com", WaitUntil::DomContentLoaded, Duration::from_secs(30)).await?;
443 /// # let nodes = page.query_selector_all("a").await?;
444 /// if let Some(next) = nodes[0].next_sibling().await? {
445 /// println!("next sibling: {}", next.text_content().await?);
446 /// }
447 /// # Ok(())
448 /// # }
449 /// ```
450 pub async fn next_sibling(&self) -> Result<Option<Self>> {
451 let attr = format!(
452 "data-stygian-t-{}",
453 ulid::Ulid::new().to_string().to_lowercase()
454 );
455 let js = format!(
456 "function() {{ \
457 var t = this.nextElementSibling; \
458 if (!t) {{ return false; }} \
459 t.setAttribute('{attr}', '1'); \
460 return true; \
461 }}"
462 );
463 self.call_traversal(&js, &attr, "next").await
464 }
465
466 /// Return the previous element sibling, or `None` if this element is the
467 /// first child of its parent.
468 ///
469 /// Uses `previousElementSibling` (skips text/comment nodes).
470 ///
471 /// # Errors
472 ///
473 /// invalidated.
474 ///
475 /// # Example
476 ///
477 /// ```no_run
478 /// use stygian_browser::{BrowserPool, BrowserConfig, WaitUntil};
479 /// use std::time::Duration;
480 ///
481 /// # async fn run() -> stygian_browser::error::Result<()> {
482 /// let pool = BrowserPool::new(BrowserConfig::default()).await?;
483 /// let handle = pool.acquire().await?;
484 /// let mut page = handle.browser().expect("valid browser").new_page().await?;
485 /// page.navigate("https://example.com", WaitUntil::DomContentLoaded, Duration::from_secs(30)).await?;
486 /// # let nodes = page.query_selector_all("a").await?;
487 /// if let Some(prev) = nodes[1].previous_sibling().await? {
488 /// println!("prev sibling: {}", prev.text_content().await?);
489 /// }
490 /// # Ok(())
491 /// # }
492 /// ```
493 pub async fn previous_sibling(&self) -> Result<Option<Self>> {
494 let attr = format!(
495 "data-stygian-t-{}",
496 ulid::Ulid::new().to_string().to_lowercase()
497 );
498 let js = format!(
499 "function() {{ \
500 var t = this.previousElementSibling; \
501 if (!t) {{ return false; }} \
502 t.setAttribute('{attr}', '1'); \
503 return true; \
504 }}"
505 );
506 self.call_traversal(&js, &attr, "prev").await
507 }
508
509 /// Shared traversal implementation used by [`parent`], [`next_sibling`],
510 /// and [`previous_sibling`].
511 ///
512 /// The caller provides a JS function that:
513 /// 1. Computes the traversal target (for example, the parent, next
514 /// sibling, or previous sibling) and stores it in a local variable.
515 /// 2. If the target is non-null, sets a unique attribute (`attr_name`)
516 /// on it and returns `true`.
517 /// 3. Returns `false` when the target is null (no such neighbour).
518 ///
519 /// This helper then resolves the tagged element from the document root,
520 /// removes the temporary attribute, and wraps the result in a
521 /// `NodeHandle`.
522 ///
523 /// [`parent`]: Self::parent
524 /// [`next_sibling`]: Self::next_sibling
525 /// [`previous_sibling`]: Self::previous_sibling
526 async fn call_traversal(
527 &self,
528 js_fn: &str,
529 attr_name: &str,
530 selector_suffix: &str,
531 ) -> Result<Option<Self>> {
532 // Step 1: Run the JS that tags the target element and reports null/non-null.
533 let op_tag = format!("NodeHandle::{selector_suffix}::tag");
534 let returns = timeout(self.cdp_timeout, self.element.call_js_fn(js_fn, false))
535 .await
536 .map_err(|_| BrowserError::Timeout {
537 operation: op_tag.clone(),
538 duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
539 })?
540 .map_err(|e| self.cdp_err_or_stale(&e, selector_suffix))?;
541
542 // JS returns false → no such neighbour.
543 let has_target = returns
544 .result
545 .value
546 .as_ref()
547 .and_then(serde_json::Value::as_bool)
548 .unwrap_or(false);
549 if !has_target {
550 return Ok(None);
551 }
552
553 let css = format!("[{attr_name}]");
554 let op_resolve = format!("NodeHandle::{selector_suffix}::resolve");
555 let element = timeout(self.cdp_timeout, self.page.find_element(css))
556 .await
557 .map_err(|_| BrowserError::Timeout {
558 operation: op_resolve.clone(),
559 duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
560 })?
561 .map_err(|e| BrowserError::CdpError {
562 operation: op_resolve,
563 message: format!("{e:?}"),
564 })?;
565
566 // is non-fatal — it leaves a harmless stale attribute in the DOM).
567 let cleanup = format!("function() {{ this.removeAttribute('{attr_name}'); }}");
568 let _ = element.call_js_fn(cleanup, false).await;
569
570 let new_selector: Arc<str> =
571 Arc::from(format!("{}::{selector_suffix}", self.selector).as_str());
572 Ok(Some(Self {
573 element,
574 selector: new_selector,
575 cdp_timeout: self.cdp_timeout,
576 page: self.page.clone(),
577 }))
578 }
579
580 /// (when the remote object reference has been invalidated) or
581 fn cdp_err_or_stale(
582 &self,
583 err: &chromiumoxide::error::CdpError,
584 operation: &str,
585 ) -> BrowserError {
586 let msg = format!("{err:?}");
587 if msg.contains("Cannot find object with id")
588 || msg.contains("context with specified id")
589 || msg.contains("Cannot find context")
590 {
591 BrowserError::StaleNode {
592 selector: self.selector.to_string(),
593 }
594 } else {
595 BrowserError::CdpError {
596 operation: operation.to_string(),
597 message: msg,
598 }
599 }
600 }
601}
602
603// ─── PageHandle ───────────────────────────────────────────────────────────────
604
/// A handle to a single browser page (tab).
///
/// Wraps a chromiumoxide [`Page`] and layers navigation with wait
/// strategies, resource filtering, status-code capture and timeout-bounded
/// accessors on top of it.
///
/// # Example
///
/// ```no_run
/// use stygian_browser::{BrowserPool, BrowserConfig};
/// use stygian_browser::page::WaitUntil;
/// use std::time::Duration;
///
/// # async fn run() -> stygian_browser::error::Result<()> {
/// let pool = BrowserPool::new(BrowserConfig::default()).await?;
/// let handle = pool.acquire().await?;
/// let mut page = handle.browser().expect("valid browser").new_page().await?;
/// page.navigate("https://example.com", WaitUntil::DomContentLoaded, Duration::from_secs(30)).await?;
/// let html = page.content().await?;
/// drop(page); // closes the tab
/// handle.release().await;
/// # Ok(())
/// # }
/// ```
pub struct PageHandle {
    /// The wrapped chromiumoxide page.
    page: Page,
    /// Upper bound applied to individual CDP calls made through this handle.
    cdp_timeout: Duration,
    /// HTTP status code of the most recent main-frame navigation, or `0` if not
    /// yet captured. It is reset to `0` at the start of every navigation.
    last_status_code: Arc<AtomicU16>,
    /// Background task processing `Fetch.requestPaused` events. Aborted and
    /// replaced each time `set_resource_filter` is called.
    resource_filter_task: Option<tokio::task::JoinHandle<()>>,
}
634
635impl PageHandle {
636 /// Wrap a raw chromiumoxide [`Page`] in a handle.
637 pub(crate) fn new(page: Page, cdp_timeout: Duration) -> Self {
638 Self {
639 page,
640 cdp_timeout,
641 last_status_code: Arc::new(AtomicU16::new(0)),
642 resource_filter_task: None,
643 }
644 }
645
646 ///
647 /// # Errors
648 ///
649 /// the CDP call fails.
650 pub async fn navigate(
651 &mut self,
652 url: &str,
653 condition: WaitUntil,
654 nav_timeout: Duration,
655 ) -> Result<()> {
656 self.setup_status_capture().await;
657 timeout(
658 nav_timeout,
659 self.navigate_inner(url, condition, nav_timeout),
660 )
661 .await
662 .map_err(|_| BrowserError::NavigationFailed {
663 url: url.to_string(),
664 reason: format!("navigation timed out after {nav_timeout:?}"),
665 })?
666 }
667
668 /// Reset the last status code and wire up the `Network.responseReceived`
669 /// so that a missing network domain never blocks navigation.
670 async fn setup_status_capture(&self) {
671 use chromiumoxide::cdp::browser_protocol::network::{
672 EventResponseReceived, ResourceType as NetworkResourceType,
673 };
674 use futures::StreamExt;
675
676 // Reset so a stale code is not returned if the new navigation fails
677 self.last_status_code.store(0, Ordering::Release);
678
679 let page_for_listener = self.page.clone();
680 let status_capture = Arc::clone(&self.last_status_code);
681 match page_for_listener
682 .event_listener::<EventResponseReceived>()
683 .await
684 {
685 Ok(mut stream) => {
686 tokio::spawn(async move {
687 while let Some(event) = stream.next().await {
688 if event.r#type == NetworkResourceType::Document {
689 let code = u16::try_from(event.response.status).unwrap_or(0);
690 if code > 0 {
691 status_capture.store(code, Ordering::Release);
692 }
693 break;
694 }
695 }
696 });
697 }
698 Err(e) => warn!("status-code capture unavailable: {e}"),
699 }
700 }
701
702 /// described in issue #7.
703 async fn navigate_inner(
704 &self,
705 url: &str,
706 condition: WaitUntil,
707 nav_timeout: Duration,
708 ) -> Result<()> {
709 use chromiumoxide::cdp::browser_protocol::page::{
710 EventDomContentEventFired, EventLoadEventFired,
711 };
712 use futures::StreamExt;
713
714 let url_owned = url.to_string();
715
716 let mut dom_events = match &condition {
717 WaitUntil::DomContentLoaded => Some(
718 self.page
719 .event_listener::<EventDomContentEventFired>()
720 .await
721 .map_err(|e| BrowserError::NavigationFailed {
722 url: url_owned.clone(),
723 reason: format!("{e:?}"),
724 })?,
725 ),
726 _ => None,
727 };
728
729 let mut load_events = match &condition {
730 WaitUntil::NetworkIdle => Some(
731 self.page
732 .event_listener::<EventLoadEventFired>()
733 .await
734 .map_err(|e| BrowserError::NavigationFailed {
735 url: url_owned.clone(),
736 reason: e.to_string(),
737 })?,
738 ),
739 _ => None,
740 };
741
742 let inflight = if matches!(condition, WaitUntil::NetworkIdle) {
743 Some(self.subscribe_inflight_counter().await)
744 } else {
745 None
746 };
747
748 self.page
749 .goto(url)
750 .await
751 .map_err(|e| BrowserError::NavigationFailed {
752 url: url_owned.clone(),
753 reason: e.to_string(),
754 })?;
755
756 match &condition {
757 WaitUntil::DomContentLoaded => {
758 if let Some(ref mut events) = dom_events {
759 let _ = events.next().await;
760 }
761 }
762 WaitUntil::NetworkIdle => {
763 if let Some(ref mut events) = load_events {
764 let _ = events.next().await;
765 }
766 if let Some(ref counter) = inflight {
767 Self::wait_network_idle(counter).await;
768 }
769 }
770 WaitUntil::Selector(css) => {
771 self.wait_for_selector(css, nav_timeout).await?;
772 }
773 }
774 Ok(())
775 }
776
777 /// Spawn three detached tasks that maintain a signed in-flight request
778 /// counter via `Network.requestWillBeSent` (+1) and
779 /// `Network.loadingFinished`/`Network.loadingFailed` (−1 each).
780 async fn subscribe_inflight_counter(&self) -> Arc<std::sync::atomic::AtomicI32> {
781 use std::sync::atomic::AtomicI32;
782
783 use chromiumoxide::cdp::browser_protocol::network::{
784 EventLoadingFailed, EventLoadingFinished, EventRequestWillBeSent,
785 };
786 use futures::StreamExt;
787
788 let counter: Arc<AtomicI32> = Arc::new(AtomicI32::new(0));
789 let pairs: [(Arc<AtomicI32>, i32); 3] = [
790 (Arc::clone(&counter), 1),
791 (Arc::clone(&counter), -1),
792 (Arc::clone(&counter), -1),
793 ];
794 let [p1, p2, p3] = [self.page.clone(), self.page.clone(), self.page.clone()];
795
796 macro_rules! spawn_tracker {
797 ($page:expr, $event:ty, $c:expr, $delta:expr) => {
798 match $page.event_listener::<$event>().await {
799 Ok(mut s) => {
800 let c = $c;
801 let d = $delta;
802 tokio::spawn(async move {
803 while s.next().await.is_some() {
804 c.fetch_add(d, Ordering::Relaxed);
805 }
806 });
807 }
808 Err(e) => warn!("network-idle tracker unavailable: {e}"),
809 }
810 };
811 }
812
813 let [(c1, d1), (c2, d2), (c3, d3)] = pairs;
814 spawn_tracker!(p1, EventRequestWillBeSent, c1, d1);
815 spawn_tracker!(p2, EventLoadingFinished, c2, d2);
816 spawn_tracker!(p3, EventLoadingFailed, c3, d3);
817
818 counter
819 }
820
821 async fn wait_network_idle(counter: &Arc<std::sync::atomic::AtomicI32>) {
822 const IDLE_THRESHOLD: i32 = 2;
823 const SETTLE: Duration = Duration::from_millis(500);
824 loop {
825 if counter.load(Ordering::Relaxed) <= IDLE_THRESHOLD {
826 tokio::time::sleep(SETTLE).await;
827 if counter.load(Ordering::Relaxed) <= IDLE_THRESHOLD {
828 break;
829 }
830 } else {
831 tokio::time::sleep(Duration::from_millis(50)).await;
832 }
833 }
834 }
835
836 ///
837 /// # Errors
838 ///
839 /// within the given timeout.
840 pub async fn wait_for_selector(&self, selector: &str, wait_timeout: Duration) -> Result<()> {
841 let selector_owned = selector.to_string();
842 let poll = async {
843 loop {
844 if self.page.find_element(selector_owned.clone()).await.is_ok() {
845 return Ok(());
846 }
847 tokio::time::sleep(Duration::from_millis(100)).await;
848 }
849 };
850
851 timeout(wait_timeout, poll)
852 .await
853 .map_err(|_| BrowserError::NavigationFailed {
854 url: String::new(),
855 reason: format!("selector '{selector_owned}' not found within {wait_timeout:?}"),
856 })?
857 }
858
859 ///
860 /// Enables `Fetch` interception and spawns a background task that continues
861 /// allowed requests and fails blocked ones with `BlockedByClient`. Any
862 /// previously set filter task is cancelled first.
863 ///
864 /// # Errors
865 ///
866 pub async fn set_resource_filter(&mut self, filter: ResourceFilter) -> Result<()> {
867 use chromiumoxide::cdp::browser_protocol::fetch::{
868 ContinueRequestParams, EnableParams, EventRequestPaused, FailRequestParams,
869 RequestPattern,
870 };
871 use chromiumoxide::cdp::browser_protocol::network::ErrorReason;
872 use futures::StreamExt as _;
873
874 if filter.is_empty() {
875 return Ok(());
876 }
877
878 // Cancel any previously running filter task.
879 if let Some(task) = self.resource_filter_task.take() {
880 task.abort();
881 }
882
883 let pattern = RequestPattern::builder().url_pattern("*").build();
884 let params = EnableParams::builder()
885 .patterns(vec![pattern])
886 .handle_auth_requests(false)
887 .build();
888
889 timeout(self.cdp_timeout, self.page.execute::<EnableParams>(params))
890 .await
891 .map_err(|_| BrowserError::Timeout {
892 operation: "Fetch.enable".to_string(),
893 duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
894 })?
895 .map_err(|e| BrowserError::CdpError {
896 operation: "Fetch.enable".to_string(),
897 message: e.to_string(),
898 })?;
899
900 // is never blocked. Without this handler Chrome holds every intercepted
901 // request indefinitely and the page hangs.
902 let mut events = self
903 .page
904 .event_listener::<EventRequestPaused>()
905 .await
906 .map_err(|e| BrowserError::CdpError {
907 operation: "Fetch.requestPaused subscribe".to_string(),
908 message: e.to_string(),
909 })?;
910
911 let page = self.page.clone();
912 debug!("Resource filter active: {:?}", filter);
913 let task = tokio::spawn(async move {
914 while let Some(event) = events.next().await {
915 let request_id = event.request_id.clone();
916 if filter.should_block(event.resource_type.as_ref()) {
917 let params = FailRequestParams::new(request_id, ErrorReason::BlockedByClient);
918 let _ = page.execute(params).await;
919 } else {
920 let _ = page.execute(ContinueRequestParams::new(request_id)).await;
921 }
922 }
923 });
924
925 self.resource_filter_task = Some(task);
926 Ok(())
927 }
928
929 /// Return the current page URL (post-navigation, post-redirect).
930 ///
931 /// internally by [`save_cookies`](Self::save_cookies); no extra network
932 /// request is made. Returns an empty string if the URL is not yet set
933 ///
934 /// # Errors
935 ///
936 /// [`BrowserError::Timeout`] if it exceeds `cdp_timeout`.
937 ///
938 /// # Example
939 ///
940 /// ```no_run
941 /// use stygian_browser::{BrowserPool, BrowserConfig};
942 /// use stygian_browser::page::WaitUntil;
943 /// use std::time::Duration;
944 ///
945 /// # async fn run() -> stygian_browser::error::Result<()> {
946 /// let pool = BrowserPool::new(BrowserConfig::default()).await?;
947 /// let handle = pool.acquire().await?;
948 /// let mut page = handle.browser().expect("valid browser").new_page().await?;
949 /// page.navigate("https://example.com", WaitUntil::DomContentLoaded, Duration::from_secs(30)).await?;
950 /// let url = page.url().await?;
951 /// println!("Final URL after redirects: {url}");
952 /// # Ok(())
953 /// # }
954 /// ```
955 pub async fn url(&self) -> Result<String> {
956 timeout(self.cdp_timeout, self.page.url())
957 .await
958 .map_err(|_| BrowserError::Timeout {
959 operation: "page.url".to_string(),
960 duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
961 })?
962 .map_err(|e| BrowserError::CdpError {
963 operation: "page.url".to_string(),
964 message: e.to_string(),
965 })
966 .map(Option::unwrap_or_default)
967 }
968
969 /// Return the HTTP status code of the most recent main-frame navigation.
970 ///
971 /// The status is captured from the `Network.responseReceived` CDP event
972 /// wired up inside [`navigate`](Self::navigate), so it reflects the
973 /// *final* response after any server-side redirects.
974 ///
975 /// navigations, when [`navigate`](Self::navigate) has not yet been called,
976 /// or if the network event subscription failed.
977 ///
978 /// # Errors
979 ///
980 ///
981 /// # Example
982 ///
983 /// ```no_run
984 /// use stygian_browser::{BrowserPool, BrowserConfig};
985 /// use stygian_browser::page::WaitUntil;
986 /// use std::time::Duration;
987 ///
988 /// # async fn run() -> stygian_browser::error::Result<()> {
989 /// let pool = BrowserPool::new(BrowserConfig::default()).await?;
990 /// let handle = pool.acquire().await?;
991 /// let mut page = handle.browser().expect("valid browser").new_page().await?;
992 /// page.navigate("https://example.com", WaitUntil::DomContentLoaded, Duration::from_secs(30)).await?;
993 /// if let Some(code) = page.status_code()? {
994 /// println!("HTTP {code}");
995 /// }
996 /// # Ok(())
997 /// # }
998 /// ```
999 pub fn status_code(&self) -> Result<Option<u16>> {
1000 let code = self.last_status_code.load(Ordering::Acquire);
1001 Ok(if code == 0 { None } else { Some(code) })
1002 }
1003
1004 /// Return the page's `<title>` text.
1005 ///
1006 /// # Errors
1007 ///
1008 pub async fn title(&self) -> Result<String> {
1009 timeout(self.cdp_timeout, self.page.get_title())
1010 .await
1011 .map_err(|_| BrowserError::Timeout {
1012 operation: "get_title".to_string(),
1013 duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
1014 })?
1015 .map_err(|e| BrowserError::ScriptExecutionFailed {
1016 script: "document.title".to_string(),
1017 reason: e.to_string(),
1018 })
1019 .map(Option::unwrap_or_default)
1020 }
1021
1022 /// Return the page's full outer HTML.
1023 ///
1024 /// # Errors
1025 ///
1026 pub async fn content(&self) -> Result<String> {
1027 timeout(self.cdp_timeout, self.page.content())
1028 .await
1029 .map_err(|_| BrowserError::Timeout {
1030 operation: "page.content".to_string(),
1031 duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
1032 })?
1033 .map_err(|e| BrowserError::ScriptExecutionFailed {
1034 script: "document.documentElement.outerHTML".to_string(),
1035 reason: e.to_string(),
1036 })
1037 }
1038
1039 /// lightweight [`NodeHandle`]s backed by CDP `RemoteObjectId`s.
1040 ///
1041 /// No HTML serialisation occurs — the browser's in-memory DOM is queried
1042 /// directly over the CDP connection, eliminating the `page.content()` +
1043 /// `scraper::Html::parse_document` round-trip.
1044 ///
1045 ///
1046 /// # Errors
1047 ///
1048 /// [`BrowserError::Timeout`] if it exceeds `cdp_timeout`.
1049 ///
1050 /// # Example
1051 ///
1052 /// ```no_run
1053 /// use stygian_browser::{BrowserPool, BrowserConfig, WaitUntil};
1054 /// use std::time::Duration;
1055 ///
1056 /// # async fn run() -> stygian_browser::error::Result<()> {
1057 /// let pool = BrowserPool::new(BrowserConfig::default()).await?;
1058 /// let handle = pool.acquire().await?;
1059 /// let mut page = handle.browser().expect("valid browser").new_page().await?;
1060 /// page.navigate("https://example.com", WaitUntil::DomContentLoaded, Duration::from_secs(30)).await?;
1061 /// # let nodes = page.query_selector_all("div[data-ux]").await?;
1062 /// # for node in &nodes {
1063 /// let ux_type = node.attr("data-ux").await?;
1064 /// let text = node.text_content().await?;
1065 /// println!("{ux_type:?}: {text}");
1066 /// # }
1067 /// # Ok(())
1068 /// # }
1069 /// ```
1070 pub async fn query_selector_all(&self, selector: &str) -> Result<Vec<NodeHandle>> {
1071 let elements = timeout(self.cdp_timeout, self.page.find_elements(selector))
1072 .await
1073 .map_err(|_| BrowserError::Timeout {
1074 operation: "PageHandle::query_selector_all".to_string(),
1075 duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
1076 })?
1077 .map_err(|e| BrowserError::CdpError {
1078 operation: "PageHandle::query_selector_all".to_string(),
1079 message: e.to_string(),
1080 })?;
1081
1082 let selector_arc: Arc<str> = Arc::from(selector);
1083 Ok(elements
1084 .into_iter()
1085 .map(|el| NodeHandle {
1086 element: el,
1087 selector: selector_arc.clone(),
1088 cdp_timeout: self.cdp_timeout,
1089 page: self.page.clone(),
1090 })
1091 .collect())
1092 }
1093
1094 /// Evaluate arbitrary JavaScript and return the result as `T`.
1095 ///
1096 /// # Errors
1097 ///
1098 /// deserialization error.
1099 pub async fn eval<T: serde::de::DeserializeOwned>(&self, script: &str) -> Result<T> {
1100 let script_owned = script.to_string();
1101 timeout(self.cdp_timeout, self.page.evaluate(script))
1102 .await
1103 .map_err(|_| BrowserError::Timeout {
1104 operation: "page.evaluate".to_string(),
1105 duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
1106 })?
1107 .map_err(|e| BrowserError::ScriptExecutionFailed {
1108 script: script_owned.clone(),
1109 reason: e.to_string(),
1110 })?
1111 .into_value::<T>()
1112 .map_err(|e| BrowserError::ScriptExecutionFailed {
1113 script: script_owned,
1114 reason: e.to_string(),
1115 })
1116 }
1117
1118 ///
1119 /// # Errors
1120 ///
1121 pub async fn save_cookies(
1122 &self,
1123 ) -> Result<Vec<chromiumoxide::cdp::browser_protocol::network::Cookie>> {
1124 use chromiumoxide::cdp::browser_protocol::network::GetCookiesParams;
1125
1126 let url = self
1127 .page
1128 .url()
1129 .await
1130 .map_err(|e| BrowserError::CdpError {
1131 operation: "page.url".to_string(),
1132 message: e.to_string(),
1133 })?
1134 .unwrap_or_default();
1135
1136 timeout(
1137 self.cdp_timeout,
1138 self.page
1139 .execute(GetCookiesParams::builder().urls(vec![url]).build()),
1140 )
1141 .await
1142 .map_err(|_| BrowserError::Timeout {
1143 operation: "Network.getCookies".to_string(),
1144 duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
1145 })?
1146 .map_err(|e| BrowserError::CdpError {
1147 operation: "Network.getCookies".to_string(),
1148 message: e.to_string(),
1149 })
1150 .map(|r| r.cookies.clone())
1151 }
1152
1153 ///
1154 /// [`SessionSnapshot`][crate::session::SessionSnapshot] and without
1155 /// requiring a direct `chromiumoxide` dependency in calling code.
1156 ///
1157 /// Individual cookie failures are logged as warnings and do not abort the
1158 /// remaining cookies.
1159 ///
1160 /// # Errors
1161 ///
1162 /// call exceeds `cdp_timeout`.
1163 ///
1164 /// # Example
1165 ///
1166 /// ```no_run
1167 /// use stygian_browser::{BrowserPool, BrowserConfig};
1168 /// use stygian_browser::session::SessionCookie;
1169 /// use std::time::Duration;
1170 ///
1171 /// # async fn run() -> stygian_browser::error::Result<()> {
1172 /// let pool = BrowserPool::new(BrowserConfig::default()).await?;
1173 /// let handle = pool.acquire().await?;
1174 /// let page = handle.browser().expect("valid browser").new_page().await?;
1175 /// let cookies = vec![SessionCookie {
1176 /// name: "session".to_string(),
1177 /// value: "abc123".to_string(),
1178 /// domain: ".example.com".to_string(),
1179 /// path: "/".to_string(),
1180 /// expires: -1.0,
1181 /// http_only: true,
1182 /// secure: true,
1183 /// same_site: "Lax".to_string(),
1184 /// }];
1185 /// page.inject_cookies(&cookies).await?;
1186 /// # Ok(())
1187 /// # }
1188 /// ```
1189 pub async fn inject_cookies(&self, cookies: &[crate::session::SessionCookie]) -> Result<()> {
1190 use chromiumoxide::cdp::browser_protocol::network::SetCookieParams;
1191
1192 for cookie in cookies {
1193 let params = match SetCookieParams::builder()
1194 .name(cookie.name.clone())
1195 .value(cookie.value.clone())
1196 .domain(cookie.domain.clone())
1197 .path(cookie.path.clone())
1198 .http_only(cookie.http_only)
1199 .secure(cookie.secure)
1200 .build()
1201 {
1202 Ok(p) => p,
1203 Err(e) => {
1204 warn!(cookie = %cookie.name, error = %e, "Failed to build cookie params");
1205 continue;
1206 }
1207 };
1208
1209 match timeout(self.cdp_timeout, self.page.execute(params)).await {
1210 Err(_) => {
1211 warn!(
1212 cookie = %cookie.name,
1213 timeout_ms = self.cdp_timeout.as_millis(),
1214 "Timed out injecting cookie"
1215 );
1216 }
1217 Ok(Err(e)) => {
1218 warn!(cookie = %cookie.name, error = %e, "Failed to inject cookie");
1219 }
1220 Ok(Ok(_)) => {}
1221 }
1222 }
1223
1224 debug!(count = cookies.len(), "Cookies injected");
1225 Ok(())
1226 }
1227
1228 /// Capture a screenshot of the current page as PNG bytes.
1229 ///
1230 /// them in-memory.
1231 ///
1232 /// # Errors
1233 ///
1234 /// command fails, or [`BrowserError::Timeout`] if it exceeds
1235 /// `cdp_timeout`.
1236 ///
1237 /// # Example
1238 ///
1239 /// ```no_run
1240 /// use stygian_browser::{BrowserPool, BrowserConfig, WaitUntil};
1241 /// use std::{time::Duration, fs};
1242 ///
1243 /// # async fn run() -> stygian_browser::error::Result<()> {
1244 /// let pool = BrowserPool::new(BrowserConfig::default()).await?;
1245 /// let handle = pool.acquire().await?;
1246 /// let mut page = handle.browser().expect("valid browser").new_page().await?;
1247 /// let png = page.screenshot().await?;
1248 /// fs::write("screenshot.png", &png).unwrap();
1249 /// # Ok(())
1250 /// # }
1251 /// ```
1252 pub async fn screenshot(&self) -> Result<Vec<u8>> {
1253 use chromiumoxide::page::ScreenshotParams;
1254
1255 let params = ScreenshotParams::builder().full_page(true).build();
1256
1257 timeout(self.cdp_timeout, self.page.screenshot(params))
1258 .await
1259 .map_err(|_| BrowserError::Timeout {
1260 operation: "Page.captureScreenshot".to_string(),
1261 duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
1262 })?
1263 .map_err(|e| BrowserError::CdpError {
1264 operation: "Page.captureScreenshot".to_string(),
1265 message: e.to_string(),
1266 })
1267 }
1268
1269 /// Borrow the underlying chromiumoxide [`Page`].
1270 pub const fn inner(&self) -> &Page {
1271 &self.page
1272 }
1273
1274 /// Close this page (tab).
1275 ///
1276 pub async fn close(self) -> Result<()> {
1277 timeout(Duration::from_secs(5), self.page.clone().close())
1278 .await
1279 .map_err(|_| BrowserError::Timeout {
1280 operation: "page.close".to_string(),
1281 duration_ms: 5000,
1282 })?
1283 .map_err(|e| BrowserError::CdpError {
1284 operation: "page.close".to_string(),
1285 message: e.to_string(),
1286 })
1287 }
1288}
1289
1290// ─── Stealth diagnostics ──────────────────────────────────────────────────────
1291
#[cfg(feature = "stealth")]
impl PageHandle {
    /// Run all built-in stealth detection checks against the current page.
    ///
    /// Walks [`crate::diagnostic::all_checks`], evaluates each check's
    /// JavaScript through [`eval`](Self::eval), and aggregates the outcomes
    /// into a [`crate::diagnostic::DiagnosticReport`]. Afterwards every probe
    /// from `crate::diagnostic::all_limitation_probes` is evaluated the same
    /// way and each reported limitation is attached to the report.
    ///
    /// A check whose script fails to evaluate is logged and
    /// recorded as a failing check; it does **not** abort the whole run.
    /// A probe whose script fails is recorded as a known limitation carrying
    /// the error text.
    ///
    /// # Errors
    ///
    /// The current implementation always returns `Ok`.
    /// Individual check failures are captured in the report.
    ///
    /// # Example
    ///
    /// ```no_run
    /// # async fn run() -> stygian_browser::error::Result<()> {
    /// use stygian_browser::{BrowserPool, BrowserConfig};
    /// use stygian_browser::page::WaitUntil;
    /// use std::time::Duration;
    ///
    /// let pool = BrowserPool::new(BrowserConfig::default()).await?;
    /// let handle = pool.acquire().await?;
    /// let browser = handle.browser().expect("valid browser");
    /// let mut page = browser.new_page().await?;
    /// page.navigate("https://example.com", WaitUntil::DomContentLoaded, Duration::from_secs(10)).await?;
    ///
    /// let report = page.verify_stealth().await?;
    /// println!("Stealth: {}/{} checks passed", report.passed_count, report.checks.len());
    /// for failure in report.failures() {
    ///     eprintln!("  FAIL  {}: {}", failure.description, failure.details);
    /// }
    /// # Ok(())
    /// # }
    /// ```
    pub async fn verify_stealth(&self) -> Result<crate::diagnostic::DiagnosticReport> {
        use crate::diagnostic::{
            CheckResult, DiagnosticReport, KnownLimitation, all_checks, all_limitation_probes,
        };

        // Phase 1: pass/fail checks. A script error becomes a failed check.
        let mut check_results: Vec<CheckResult> = Vec::new();
        for check in all_checks() {
            let evaluated = self.eval::<String>(check.script).await;
            let outcome = match evaluated {
                Err(e) => {
                    tracing::warn!(
                        check = ?check.id,
                        error = %e,
                        "stealth check script failed during evaluation"
                    );
                    CheckResult {
                        id: check.id,
                        description: check.description.to_string(),
                        passed: false,
                        details: format!("script error: {e}"),
                    }
                }
                Ok(json) => check.parse_output(&json),
            };
            tracing::debug!(
                check = ?outcome.id,
                passed = outcome.passed,
                details = %outcome.details,
                "stealth check result"
            );
            check_results.push(outcome);
        }

        // Phase 2: limitation probes. A script error is itself reported as a
        // limitation.
        let mut limitations = Vec::new();
        for probe in all_limitation_probes() {
            let observed = match self.eval::<String>(probe.script).await {
                Err(error) => Some(KnownLimitation {
                    id: probe.id,
                    description: probe.description.to_string(),
                    details: format!("script error: {error}"),
                }),
                Ok(json) => probe.parse_output(&json),
            };
            let Some(limitation) = observed else {
                continue;
            };
            tracing::debug!(
                limitation = ?limitation.id,
                details = %limitation.details,
                "stealth limitation observed"
            );
            limitations.push(limitation);
        }

        Ok(DiagnosticReport::new(check_results).with_known_limitations(limitations))
    }

    /// Run stealth checks and attach transport diagnostics (JA3/JA4/HTTP3).
    ///
    /// Produces the report from [`verify_stealth`](Self::verify_stealth),
    /// reads `navigator.userAgent` from the page, and combines that user
    /// agent with the optional `observed` transport data into a
    /// `TransportDiagnostic` attached to the report. If the user agent cannot
    /// be read, a warning is logged and an empty string is used instead.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by
    /// [`verify_stealth`](Self::verify_stealth).
    pub async fn verify_stealth_with_transport(
        &self,
        observed: Option<crate::diagnostic::TransportObservations>,
    ) -> Result<crate::diagnostic::DiagnosticReport> {
        use crate::diagnostic::TransportDiagnostic;

        let report = self.verify_stealth().await?;

        let user_agent = self
            .eval::<String>("navigator.userAgent")
            .await
            .unwrap_or_else(|e| {
                tracing::warn!(error = %e, "failed to read navigator.userAgent for transport diagnostics");
                String::new()
            });

        let transport =
            TransportDiagnostic::from_user_agent_and_observations(&user_agent, observed.as_ref());

        Ok(report.with_transport(transport))
    }
}
1406
1407// ─── extract feature ─────────────────────────────────────────────────────────
1408
#[cfg(feature = "extract")]
impl PageHandle {
    /// Extract a `T` from every node matching `selector`.
    ///
    /// The selector is resolved with
    /// [`query_selector_all`](Self::query_selector_all) and
    /// `T::extract_from` is then run against each matched node.
    ///
    /// All per-node extractions are driven concurrently via
    /// [`futures::future::try_join_all`].
    ///
    /// # Errors
    ///
    /// Returns the error from the selector query (for example
    /// [`BrowserError::CdpError`]) if it
    /// fails, or [`BrowserError::ExtractionFailed`] if any field extraction
    /// fails.
    ///
    /// # Example
    ///
    /// ```ignore
    /// use stygian_browser::extract::Extract;
    /// use stygian_browser::{BrowserPool, BrowserConfig, WaitUntil};
    /// use std::time::Duration;
    ///
    /// #[derive(Extract)]
    /// struct Link {
    ///     href: Option<String>,
    /// }
    ///
    /// # async fn run() -> stygian_browser::error::Result<()> {
    /// let pool = BrowserPool::new(BrowserConfig::default()).await?;
    /// let handle = pool.acquire().await?;
    /// let mut page = handle.browser().expect("valid browser").new_page().await?;
    /// page.navigate(
    ///     "https://example.com",
    ///     WaitUntil::DomContentLoaded,
    ///     Duration::from_secs(30),
    /// ).await?;
    /// let links: Vec<Link> = page.extract_all::<Link>("nav li").await?;
    /// # Ok(())
    /// # }
    /// ```
    pub async fn extract_all<T>(&self, selector: &str) -> Result<Vec<T>>
    where
        T: crate::extract::Extractable,
    {
        let nodes = self.query_selector_all(selector).await?;
        let pending = nodes.iter().map(|node| T::extract_from(node));

        match futures::future::try_join_all(pending).await {
            Ok(items) => Ok(items),
            Err(cause) => Err(BrowserError::ExtractionFailed(cause)),
        }
    }

    /// Try each selector in `selectors` in order and return the extracted
    /// results from the **first** selector that matches at least one node.
    ///
    /// This is useful when a page may use different markup across versions or
    /// A/B variants — supply the preferred selector first and progressively
    /// wider fallbacks afterwards.
    ///
    /// Returns an empty `Vec` only when *all* selectors match zero nodes
    /// (including when `selectors` itself is empty). Once a selector matches,
    /// later selectors are never consulted: if extraction from those matched
    /// nodes fails, the error is returned instead of falling back.
    ///
    /// # Errors
    ///
    /// Returns the error from a failing selector query (for example
    /// [`BrowserError::CdpError`]), or
    /// [`BrowserError::ExtractionFailed`] if a matched node fails extraction.
    ///
    /// # Example
    ///
    /// ```ignore
    /// use stygian_browser::extract::Extract;
    ///
    /// #[derive(Extract)]
    /// struct Headline { title: String }
    ///
    /// # async fn run(page: &stygian_browser::PageHandle) -> stygian_browser::error::Result<()> {
    /// // Try modern selector first, fall back to legacy markup.
    /// let items = page
    ///     .extract_all_with_fallback::<Headline>(&["h2.headline", "h2.title", "h2"])
    ///     .await?;
    /// # Ok(())
    /// # }
    /// ```
    pub async fn extract_all_with_fallback<T>(&self, selectors: &[&str]) -> Result<Vec<T>>
    where
        T: crate::extract::Extractable,
    {
        for candidate in selectors.iter().copied() {
            let nodes = self.query_selector_all(candidate).await?;
            if !nodes.is_empty() {
                // First selector with matches decides the outcome.
                let pending = nodes.iter().map(|node| T::extract_from(node));
                return futures::future::try_join_all(pending)
                    .await
                    .map_err(BrowserError::ExtractionFailed);
            }
        }

        Ok(Vec::new())
    }

    /// Extract from every node matching `selector`, **skipping** nodes where
    /// a required field is absent (i.e. [`ExtractionError::Missing`]).
    ///
    /// Unlike [`extract_all`], this method is lenient about structural
    /// mismatches: nodes that fail with [`ExtractionError::Missing`] are
    /// dropped from the result set (a debug log line is emitted for each).
    /// Every other extraction error still propagates as a hard failure.
    /// Nodes are processed one after another, in query order.
    ///
    /// This is useful when scraping heterogeneous lists where some items
    /// lack an optional field that your struct treats as required.
    ///
    /// [`extract_all`]: Self::extract_all
    /// [`ExtractionError::Missing`]: crate::extract::ExtractionError::Missing
    ///
    /// # Errors
    ///
    /// Returns the error from the selector query (for example
    /// [`BrowserError::CdpError`]), or
    /// [`BrowserError::ExtractionFailed`] for non-`Missing` extraction errors.
    ///
    /// # Example
    ///
    /// ```ignore
    /// use stygian_browser::extract::Extract;
    ///
    /// #[derive(Extract)]
    /// struct Price { amount: String }
    ///
    /// # async fn run(page: &stygian_browser::PageHandle) -> stygian_browser::error::Result<()> {
    /// // Products without a price tag are silently skipped.
    /// let prices = page.extract_resilient::<Price>(".product").await?;
    /// # Ok(())
    /// # }
    /// ```
    pub async fn extract_resilient<T>(&self, selector: &str) -> Result<Vec<T>>
    where
        T: crate::extract::Extractable,
    {
        use crate::extract::ExtractionError;

        let nodes = self.query_selector_all(selector).await?;
        let mut extracted = Vec::with_capacity(nodes.len());

        for node in &nodes {
            let item = match T::extract_from(node).await {
                Ok(item) => item,
                Err(ExtractionError::Missing { .. }) => {
                    tracing::debug!(
                        selector,
                        "extract_resilient: skipping node with missing required field"
                    );
                    continue;
                }
                Err(other) => return Err(BrowserError::ExtractionFailed(other)),
            };
            extracted.push(item);
        }

        Ok(extracted)
    }
}
1569
1570// ─── similarity feature ──────────────────────────────────────────────────────
1571
#[cfg(feature = "similarity")]
impl NodeHandle {
    /// Compute a structural [`crate::similarity::ElementFingerprint`] for this
    /// node.
    ///
    /// Issues a single `Runtime.callFunctionOn` JS eval that extracts the tag,
    /// class list, attribute names, and body-depth in one round-trip. The
    /// script returns the data as a JSON string, which is then deserialized.
    ///
    /// # Errors
    ///
    /// Returns [`BrowserError::Timeout`] if the call exceeds `cdp_timeout`,
    /// the error produced by `cdp_err_or_stale` if the CDP call fails
    /// (presumably a stale-node error when the node has been
    /// invalidated), or [`BrowserError::ScriptExecutionFailed`] if the script
    /// produces unexpected output.
    pub async fn fingerprint(&self) -> Result<crate::similarity::ElementFingerprint> {
        const JS: &str = r"function() {
    var el = this;
    var tag = el.tagName.toLowerCase();
    var classes = Array.prototype.slice.call(el.classList).sort();
    var attrNames = Array.prototype.slice.call(el.attributes)
        .map(function(a) { return a.name; })
        .filter(function(n) { return n !== 'class' && n !== 'id'; })
        .sort();
    var depth = 0;
    var n = el.parentElement;
    while (n && n.tagName.toLowerCase() !== 'body') { depth++; n = n.parentElement; }
    return JSON.stringify({ tag: tag, classes: classes, attrNames: attrNames, depth: depth });
}";

        let call = self.element.call_js_fn(JS, true);
        let returns = match tokio::time::timeout(self.cdp_timeout, call).await {
            Err(_elapsed) => {
                return Err(BrowserError::Timeout {
                    operation: "NodeHandle::fingerprint".to_string(),
                    duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
                });
            }
            Ok(Err(e)) => return Err(self.cdp_err_or_stale(&e, "fingerprint")),
            Ok(Ok(returns)) => returns,
        };

        // The script serializes its result, so a string value is expected.
        let Some(json_str) = returns
            .result
            .value
            .as_ref()
            .and_then(|value| value.as_str())
        else {
            return Err(BrowserError::ScriptExecutionFailed {
                script: "NodeHandle::fingerprint".to_string(),
                reason: "CDP returned no string value from fingerprint script".to_string(),
            });
        };

        match serde_json::from_str::<crate::similarity::ElementFingerprint>(json_str) {
            Ok(fingerprint) => Ok(fingerprint),
            Err(e) => Err(BrowserError::ScriptExecutionFailed {
                script: "NodeHandle::fingerprint".to_string(),
                reason: format!("failed to deserialise fingerprint JSON: {e}"),
            }),
        }
    }
}
1624
#[cfg(feature = "similarity")]
impl PageHandle {
    /// Find nodes on the current page that are structurally similar to
    /// `reference`, filtered according to
    /// [`crate::similarity::SimilarityConfig`].
    ///
    /// Fingerprints `reference` (via
    /// [`NodeHandle::fingerprint`]), then fingerprints every candidate returned
    /// by `query_selector_all("*")` one at a time and keeps those whose
    /// [`crate::similarity::jaccard_weighted`] score is at least
    /// `config.threshold`. Results are ordered by score descending and, when
    /// `config.max_results` is greater than zero, truncated to that many
    /// entries.
    ///
    /// Candidates whose fingerprint cannot be obtained — for any reason — are
    /// skipped without an error.
    ///
    /// # Example
    ///
    /// ```no_run
    /// use stygian_browser::{BrowserPool, BrowserConfig, WaitUntil};
    /// use stygian_browser::similarity::SimilarityConfig;
    /// use std::time::Duration;
    ///
    /// # async fn run() -> stygian_browser::error::Result<()> {
    /// let pool = BrowserPool::new(BrowserConfig::default()).await?;
    /// let handle = pool.acquire().await?;
    /// let mut page = handle.browser().expect("valid browser").new_page().await?;
    /// page.navigate("https://example.com", WaitUntil::DomContentLoaded, Duration::from_secs(30)).await?;
    ///
    /// let nodes = page.query_selector_all("h1").await?;
    /// let reference = nodes.into_iter().next().ok_or(stygian_browser::error::BrowserError::StaleNode { selector: "h1".to_string() })?;
    /// let similar = page.find_similar(&reference, SimilarityConfig::default()).await?;
    /// for m in &similar {
    ///     println!("score={:.2}", m.score);
    /// }
    /// # Ok(())
    /// # }
    /// ```
    ///
    /// # Errors
    ///
    /// Propagates any error from fingerprinting `reference` (for example
    /// [`BrowserError::ScriptExecutionFailed`]) and from the candidate query.
    pub async fn find_similar(
        &self,
        reference: &NodeHandle,
        config: crate::similarity::SimilarityConfig,
    ) -> Result<Vec<crate::similarity::SimilarMatch>> {
        use crate::similarity::{SimilarMatch, jaccard_weighted};

        let ref_fp = reference.fingerprint().await?;

        let mut matches: Vec<SimilarMatch> = Vec::new();
        for node in self.query_selector_all("*").await? {
            // Stale / detached nodes are silently skipped.
            let Ok(cand_fp) = node.fingerprint().await else {
                continue;
            };
            let score = jaccard_weighted(&ref_fp, &cand_fp);
            if score >= config.threshold {
                matches.push(SimilarMatch { node, score });
            }
        }

        // Highest score first; incomparable scores are treated as equal.
        matches.sort_by(|lhs, rhs| {
            rhs.score
                .partial_cmp(&lhs.score)
                .unwrap_or(std::cmp::Ordering::Equal)
        });

        // `max_results == 0` means "no limit".
        if config.max_results != 0 {
            matches.truncate(config.max_results);
        }

        Ok(matches)
    }
}
1693
1694impl Drop for PageHandle {
1695 fn drop(&mut self) {
1696 warn!("PageHandle dropped without explicit close(); spawning cleanup task");
1697 // chromiumoxide Page does not implement close on Drop, so we spawn
1698 // swap it out. We clone the Page handle (it's Arc-backed internally).
1699 let page = self.page.clone();
1700 tokio::spawn(async move {
1701 let _ = page.close().await;
1702 });
1703 }
1704}
1705
1706// ─── Session warmup & refresh ─────────────────────────────────────────────────
1707
1708/// Simplified, JSON-serializable wait strategy used in [`WarmupOptions`] and
1709/// [`RefreshOptions`].
1710///
1711/// This is a serialization-friendly analogue of [`WaitUntil`]. Use
1712/// [`WarmupWait::into_wait_until`] to convert before calling
1713/// [`PageHandle::navigate`].
1714#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Default)]
1715#[serde(rename_all = "snake_case")]
1716pub enum WarmupWait {
1717 /// Wait until the HTML is fully parsed (`DOMContentLoaded`). This is the
1718 /// default and works for most pages.
1719 #[default]
1720 DomContentLoaded,
1721 /// Wait until there are no more than two in-flight network requests for at
1722 /// least 500 ms after navigation.
1723 NetworkIdle,
1724}
1725
1726impl WarmupWait {
1727 /// Convert into the lower-level [`WaitUntil`] enum.
1728 #[must_use]
1729 pub const fn into_wait_until(self) -> WaitUntil {
1730 match self {
1731 Self::DomContentLoaded => WaitUntil::DomContentLoaded,
1732 Self::NetworkIdle => WaitUntil::NetworkIdle,
1733 }
1734 }
1735}
1736
1737/// Options for [`PageHandle::warmup`].
1738///
1739/// # Example
1740///
1741/// ```
1742/// use stygian_browser::page::{WarmupOptions, WarmupWait};
1743///
1744/// let opts = WarmupOptions {
1745/// url: "https://example.com".to_string(),
1746/// wait: WarmupWait::DomContentLoaded,
1747/// timeout_ms: 30_000,
1748/// stabilize_ms: 500,
1749/// };
1750/// assert_eq!(opts.timeout_ms, 30_000);
1751/// ```
1752#[derive(Debug, Clone, Serialize, Deserialize)]
1753pub struct WarmupOptions {
1754 /// The URL to navigate to during warmup.
1755 pub url: String,
1756 /// Wait strategy applied after the navigation commit (default:
1757 /// `DomContentLoaded`).
1758 #[serde(default)]
1759 pub wait: WarmupWait,
1760 /// Navigation timeout in milliseconds. Default: `30 000`.
1761 #[serde(default = "WarmupOptions::default_timeout_ms")]
1762 pub timeout_ms: u64,
1763 /// Additional pause after navigation to let dynamic resources (XHR,
1764 /// lazy-loaded images) settle, in milliseconds. `0` disables the
1765 /// stabilization step (default).
1766 #[serde(default)]
1767 pub stabilize_ms: u64,
1768}
1769
1770impl WarmupOptions {
1771 /// Returns the default navigation timeout (30 000 ms).
1772 #[must_use]
1773 pub const fn default_timeout_ms() -> u64 {
1774 30_000
1775 }
1776}
1777
1778impl Default for WarmupOptions {
1779 fn default() -> Self {
1780 Self {
1781 url: String::new(),
1782 wait: WarmupWait::DomContentLoaded,
1783 timeout_ms: Self::default_timeout_ms(),
1784 stabilize_ms: 0,
1785 }
1786 }
1787}
1788
1789/// Diagnostic report produced by [`PageHandle::warmup`].
1790///
1791/// # Example
1792///
1793/// ```
1794/// use stygian_browser::page::WarmupReport;
1795/// let report = WarmupReport {
1796/// url: "https://example.com".to_string(),
1797/// elapsed_ms: 250,
1798/// status_code: Some(200),
1799/// title: "Example Domain".to_string(),
1800/// stabilized: false,
1801/// };
1802/// assert_eq!(report.status_code, Some(200));
1803/// ```
1804#[derive(Debug, Clone, Serialize, Deserialize)]
1805pub struct WarmupReport {
1806 /// The URL that was warmed.
1807 pub url: String,
1808 /// Elapsed wall-time in milliseconds.
1809 pub elapsed_ms: u64,
1810 /// HTTP status code of the warmup navigation, if captured by the
1811 /// `Network.responseReceived` listener.
1812 pub status_code: Option<u16>,
1813 /// Page title after warmup navigation.
1814 pub title: String,
1815 /// Whether a stabilization pause (`stabilize_ms > 0`) was applied after
1816 /// navigation.
1817 pub stabilized: bool,
1818}
1819
1820/// Options for [`PageHandle::refresh`].
1821///
1822/// # Example
1823///
1824/// ```
1825/// use stygian_browser::page::{RefreshOptions, WarmupWait};
1826///
1827/// let opts = RefreshOptions {
1828/// wait: WarmupWait::DomContentLoaded,
1829/// timeout_ms: 15_000,
1830/// reset_connection: true,
1831/// };
1832/// assert!(opts.reset_connection);
1833/// ```
1834#[derive(Debug, Clone, Serialize, Deserialize)]
1835pub struct RefreshOptions {
1836 /// Wait strategy applied after the reload (default: `DomContentLoaded`).
1837 #[serde(default)]
1838 pub wait: WarmupWait,
1839 /// Reload timeout in milliseconds. Default: `30 000`.
1840 #[serde(default = "RefreshOptions::default_timeout_ms")]
1841 pub timeout_ms: u64,
1842 /// When `true`, re-navigates to the current URL rather than issuing a
1843 /// browser-level reload. This signals to the calling code that a new TCP
1844 /// connection is desired while cookies and storage are retained in the
1845 /// browser process. Default: `false`.
1846 #[serde(default)]
1847 pub reset_connection: bool,
1848}
1849
1850impl RefreshOptions {
1851 /// Returns the default reload timeout (30 000 ms).
1852 #[must_use]
1853 pub const fn default_timeout_ms() -> u64 {
1854 30_000
1855 }
1856}
1857
1858impl Default for RefreshOptions {
1859 fn default() -> Self {
1860 Self {
1861 wait: WarmupWait::DomContentLoaded,
1862 timeout_ms: Self::default_timeout_ms(),
1863 reset_connection: false,
1864 }
1865 }
1866}
1867
1868/// Diagnostic report produced by [`PageHandle::refresh`].
1869///
1870/// # Example
1871///
1872/// ```
1873/// use stygian_browser::page::RefreshReport;
1874/// let report = RefreshReport {
1875/// url: "https://example.com".to_string(),
1876/// elapsed_ms: 180,
1877/// status_code: Some(200),
1878/// };
1879/// assert_eq!(report.elapsed_ms, 180);
1880/// ```
1881#[derive(Debug, Clone, Serialize, Deserialize)]
1882pub struct RefreshReport {
1883 /// URL of the page after the refresh navigation.
1884 pub url: String,
1885 /// Elapsed wall-time in milliseconds.
1886 pub elapsed_ms: u64,
1887 /// HTTP status code of the refresh navigation, if captured.
1888 pub status_code: Option<u16>,
1889}
1890
1891// ─── PageHandle warmup / refresh ──────────────────────────────────────────────
1892
1893impl PageHandle {
1894 /// Warm up a browser session by navigating to `options.url` and
1895 /// optionally waiting for dynamic resources to settle.
1896 ///
1897 /// Warmup is **idempotent**: calling it repeatedly re-navigates and
1898 /// re-warms the same session without adverse side effects.
1899 ///
1900 /// # Errors
1901 ///
1902 /// Returns [`BrowserError::NavigationFailed`] if the navigation times out
1903 /// or the underlying CDP call fails.
1904 ///
1905 /// # Example
1906 ///
1907 /// ```no_run
1908 /// # async fn run() -> stygian_browser::error::Result<()> {
1909 /// use stygian_browser::{BrowserPool, BrowserConfig};
1910 /// use stygian_browser::page::{WarmupOptions, WarmupWait};
1911 ///
1912 /// let pool = BrowserPool::new(BrowserConfig::default()).await?;
1913 /// let handle = pool.acquire().await?;
1914 /// let mut page = handle.browser().expect("valid browser").new_page().await?;
1915 ///
1916 /// let report = page.warmup(WarmupOptions {
1917 /// url: "https://example.com".to_string(),
1918 /// wait: WarmupWait::DomContentLoaded,
1919 /// timeout_ms: 30_000,
1920 /// stabilize_ms: 500,
1921 /// }).await?;
1922 /// println!("warmed in {}ms: {}", report.elapsed_ms, report.title);
1923 /// handle.release().await;
1924 /// # Ok(())
1925 /// # }
1926 /// ```
1927 pub async fn warmup(&mut self, options: WarmupOptions) -> Result<WarmupReport> {
1928 let start = std::time::Instant::now();
1929 let nav_timeout = Duration::from_millis(options.timeout_ms);
1930 self.navigate(
1931 &options.url,
1932 options.wait.clone().into_wait_until(),
1933 nav_timeout,
1934 )
1935 .await?;
1936 let status_code = self.status_code()?;
1937 let title = self.title().await.unwrap_or_default();
1938 let stabilized = options.stabilize_ms > 0;
1939 if stabilized {
1940 tokio::time::sleep(Duration::from_millis(options.stabilize_ms)).await;
1941 }
1942 let elapsed_ms = u64::try_from(start.elapsed().as_millis()).unwrap_or(u64::MAX);
1943 Ok(WarmupReport {
1944 url: options.url,
1945 elapsed_ms,
1946 status_code,
1947 title,
1948 stabilized,
1949 })
1950 }
1951
1952 /// Refresh the current page, retaining all in-browser session state
1953 /// (cookies, `localStorage`, `sessionStorage`).
1954 ///
1955 /// When `options.reset_connection` is `false` (default) a standard
1956 /// CDP reload is issued. When `true`, the current URL is re-navigated,
1957 /// which expresses the caller's intent to force a new underlying TCP/TLS
1958 /// connection while keeping all browser-side state intact.
1959 ///
1960 /// Refresh is **idempotent**: repeated calls simply reload the page again.
1961 ///
1962 /// # Errors
1963 ///
1964 /// Returns [`BrowserError::NavigationFailed`] if the current URL cannot be
1965 /// determined or the reload times out.
1966 ///
1967 /// # Example
1968 ///
1969 /// ```no_run
1970 /// # async fn run() -> stygian_browser::error::Result<()> {
1971 /// use stygian_browser::{BrowserPool, BrowserConfig};
1972 /// use stygian_browser::page::{RefreshOptions, WaitUntil};
1973 ///
1974 /// let pool = BrowserPool::new(BrowserConfig::default()).await?;
1975 /// let handle = pool.acquire().await?;
1976 /// let mut page = handle.browser().expect("valid browser").new_page().await?;
1977 /// page.navigate(
1978 /// "https://example.com",
1979 /// WaitUntil::DomContentLoaded,
1980 /// std::time::Duration::from_secs(30),
1981 /// ).await?;
1982 ///
1983 /// let report = page.refresh(RefreshOptions::default()).await?;
1984 /// println!("refreshed in {}ms", report.elapsed_ms);
1985 /// handle.release().await;
1986 /// # Ok(())
1987 /// # }
1988 /// ```
1989 pub async fn refresh(&mut self, options: RefreshOptions) -> Result<RefreshReport> {
1990 let start = std::time::Instant::now();
1991 let nav_timeout = Duration::from_millis(options.timeout_ms);
1992 let wait = options.wait.clone().into_wait_until();
1993 // Resolve the current URL before any navigation changes it.
1994 let current_url = self.url().await?;
1995 if current_url.is_empty() || current_url == "about:blank" {
1996 return Err(BrowserError::NavigationFailed {
1997 url: current_url,
1998 reason: "page has not been navigated yet; call warmup() or navigate() first"
1999 .to_string(),
2000 });
2001 }
2002 // Both code paths navigate to the same URL. `reset_connection: true`
2003 // expresses the *intent* to use a new TCP connection; the browser is free
2004 // to reuse or create a new connection as its connection pool dictates.
2005 self.navigate(¤t_url, wait, nav_timeout).await?;
2006 let status_code = self.status_code()?;
2007 let url = self.url().await?;
2008 let elapsed_ms = u64::try_from(start.elapsed().as_millis()).unwrap_or(u64::MAX);
2009 Ok(RefreshReport {
2010 url,
2011 elapsed_ms,
2012 status_code,
2013 })
2014 }
2015}
2016
2017// ─── Tests ────────────────────────────────────────────────────────────────────
2018
#[cfg(test)]
mod tests {
    use super::*;

    /// Return type for fallible tests that propagate any error with `?`.
    type TestResult = std::result::Result<(), Box<dyn std::error::Error>>;

    /// Render the `Display` output of a `StaleNode` error built for `selector`.
    fn stale_node_message(selector: &str) -> String {
        BrowserError::StaleNode {
            selector: selector.to_string(),
        }
        .to_string()
    }

    // ── ResourceFilter / ResourceType tests ──────────────────────────────────

    /// `block_media()` blocks images, fonts, stylesheets and media while
    /// letting scripts and XHR through.
    #[test]
    fn resource_filter_block_media_blocks_image() {
        let media_filter = ResourceFilter::block_media();
        for blocked in ["Image", "Font", "Stylesheet", "Media"] {
            assert!(media_filter.should_block(blocked));
        }
        for allowed in ["Script", "XHR"] {
            assert!(!media_filter.should_block(allowed));
        }
    }

    /// Resource-type matching ignores ASCII case.
    #[test]
    fn resource_filter_case_insensitive() {
        let images_and_fonts = ResourceFilter::block_images_and_fonts();
        for spelling in ["image", "IMAGE"] {
            assert!(images_and_fonts.should_block(spelling));
        }
        assert!(!images_and_fonts.should_block("Stylesheet"));
    }

    /// Chained `block()` calls accumulate; unlisted types stay unblocked.
    #[test]
    fn resource_filter_builder_chain() {
        let chained = ResourceFilter::default()
            .block(ResourceType::Image)
            .block(ResourceType::Font);
        for blocked in ["Image", "Font"] {
            assert!(chained.should_block(blocked));
        }
        assert!(!chained.should_block("Stylesheet"));
    }

    /// Blocking the same type twice stores it only once.
    #[test]
    fn resource_filter_dedup_block() {
        let blocked_twice = ResourceFilter::default()
            .block(ResourceType::Image)
            .block(ResourceType::Image);
        let stored = blocked_twice.blocked.len();
        assert_eq!(stored, 1);
    }

    /// The default filter is empty; `block_media()` is not.
    #[test]
    fn resource_filter_is_empty_when_default() {
        let untouched = ResourceFilter::default();
        let media = ResourceFilter::block_media();
        assert!(untouched.is_empty());
        assert!(!media.is_empty());
    }

    /// `WaitUntil::Selector` carries the selector string it was built with.
    #[test]
    fn wait_until_selector_stores_string() {
        let strategy = WaitUntil::Selector("#foo".to_string());
        let holds_foo = matches!(strategy, WaitUntil::Selector(ref s) if s == "#foo");
        assert!(holds_foo);
    }

    /// Every `ResourceType` variant maps to its CDP string.
    #[test]
    fn resource_type_cdp_str() {
        let expectations = [
            (ResourceType::Image, "Image"),
            (ResourceType::Font, "Font"),
            (ResourceType::Stylesheet, "Stylesheet"),
            (ResourceType::Media, "Media"),
        ];
        for (resource, cdp_name) in expectations {
            assert_eq!(resource.as_cdp_str(), cdp_name);
        }
    }

    /// Compile-time check that `PageHandle` is both `Send` and `Sync`.
    #[test]
    fn page_handle_is_send_sync() {
        fn assert_send_sync<T: Send + Sync>() {}
        assert_send_sync::<PageHandle>();
    }

    /// Verify the resilient extractor correctly classifies `ExtractionError`
    /// variants — `Missing` must be treated as "skip", others as hard errors.
    #[cfg(feature = "extract")]
    #[test]
    fn extraction_error_missing_is_skippable() {
        use crate::extract::ExtractionError;

        let skippable = ExtractionError::Missing {
            field: "title",
            selector: "h1",
        };
        // A `Missing` wrapped inside `Nested` must NOT match the skip pattern.
        let wrapped = ExtractionError::Nested {
            field: "link",
            source: Box::new(ExtractionError::Missing {
                field: "href",
                selector: "a",
            }),
        };

        assert!(
            matches!(skippable, ExtractionError::Missing { .. }),
            "ExtractionError::Missing should be the skip variant"
        );
        assert!(
            !matches!(wrapped, ExtractionError::Missing { .. }),
            "ExtractionError::Nested must not match Missing"
        );
    }

    // ── Status-code sentinel tests ────────────────────────────────────────────

    /// A stored value of `0` is the "no status" sentinel and maps to `None`.
    /// The atomic-to-`Option<u16>` mapping is pure logic, testable without a
    /// live browser.
    #[test]
    fn status_code_sentinel_zero_maps_to_none() {
        use std::sync::atomic::{AtomicU16, Ordering};
        let raw = AtomicU16::new(0).load(Ordering::Acquire);
        let mapped = Some(raw).filter(|&code| code != 0);
        assert_eq!(mapped, None::<u16>);
    }

    /// Any non-zero stored value maps to `Some(value)`.
    #[test]
    fn status_code_non_zero_maps_to_some() {
        use std::sync::atomic::{AtomicU16, Ordering};
        for expected in [200u16, 301, 404, 503] {
            let raw = AtomicU16::new(expected).load(Ordering::Acquire);
            let mapped = Some(raw).filter(|&code| code != 0);
            assert_eq!(mapped, Some(expected));
        }
    }

    // ── NodeHandle pure-logic tests ───────────────────────────────────────────

    /// `attr_map` relies on `chunks_exact(2)` — verify the pairing logic is
    /// correct without a live browser by exercising it directly.
    #[test]
    fn attr_map_chunking_pairs_correctly() {
        let flat = [
            "id".to_string(),
            "main".to_string(),
            "data-ux".to_string(),
            "Section".to_string(),
            "class".to_string(),
            "container".to_string(),
        ];
        let pairs: std::collections::HashMap<String, String> = flat
            .chunks_exact(2)
            .filter_map(|pair| match pair {
                [name, value] => Some((name.clone(), value.clone())),
                _ => None,
            })
            .collect();

        assert_eq!(pairs.len(), 3);
        let expected = [("id", "main"), ("data-ux", "Section"), ("class", "container")];
        for (name, value) in expected {
            assert_eq!(pairs.get(name).map(String::as_str), Some(value));
        }
    }

    /// An odd-length flat list is handled gracefully — the trailing element
    /// is silently ignored.
    #[test]
    fn attr_map_chunking_ignores_odd_trailing() {
        let flat = ["orphan".to_string()]; // no value
        let pairs: std::collections::HashMap<String, String> = flat
            .chunks_exact(2)
            .filter_map(|pair| match pair {
                [name, value] => Some((name.clone(), value.clone())),
                _ => None,
            })
            .collect();
        assert!(pairs.is_empty());
    }

    /// Empty flat list → empty map.
    #[test]
    fn attr_map_chunking_empty_input() {
        let flat: Vec<String> = Vec::new();
        let mut pairs: std::collections::HashMap<String, String> =
            std::collections::HashMap::new();
        for pair in flat.chunks_exact(2) {
            if let [name, value] = pair {
                pairs.insert(name.clone(), value.clone());
            }
        }
        assert!(pairs.is_empty());
    }

    /// A JSON array of tag names deserializes into a `Vec<String>` in order.
    #[test]
    fn ancestors_json_parse_round_trip() -> std::result::Result<(), serde_json::Error> {
        let parsed = serde_json::from_str::<Vec<String>>(r#"["p","article","body","html"]"#)?;
        assert_eq!(parsed, ["p", "article", "body", "html"]);
        Ok(())
    }

    /// An empty JSON array deserializes into an empty `Vec<String>`.
    #[test]
    fn ancestors_json_parse_empty() -> std::result::Result<(), serde_json::Error> {
        let parsed = serde_json::from_str::<Vec<String>>("[]")?;
        assert!(parsed.is_empty());
        Ok(())
    }

    /// A `StaleNode` error whose selector carries a traversal suffix (e.g.
    /// `"div::parent"`) must surface that suffix in its `Display` output so
    /// callers can locate the failed traversal in logs.
    #[test]
    fn traversal_selector_suffix_in_stale_error() {
        let msg = stale_node_message("div::parent");
        assert!(
            msg.contains("div::parent"),
            "StaleNode display must include the full selector; got: {msg}"
        );
    }

    /// The `::next` traversal suffix is preserved in the `StaleNode` message.
    #[test]
    fn traversal_next_suffix_in_stale_error() {
        let rendered = stale_node_message("li.price::next");
        assert!(rendered.contains("li.price::next"));
    }

    /// The `::prev` traversal suffix is preserved in the `StaleNode` message.
    #[test]
    fn traversal_prev_suffix_in_stale_error() {
        let rendered = stale_node_message("td.label::prev");
        assert!(rendered.contains("td.label::prev"));
    }

    // ── Warmup / Refresh type tests ───────────────────────────────────────────

    /// Default warm-up options: DOM-content-loaded wait, default timeout,
    /// no stabilisation pause.
    #[test]
    fn warmup_options_defaults() {
        let defaults = WarmupOptions::default();
        assert_eq!(defaults.stabilize_ms, 0);
        assert_eq!(defaults.timeout_ms, WarmupOptions::default_timeout_ms());
        assert_eq!(defaults.wait, WarmupWait::DomContentLoaded);
    }

    /// `WarmupOptions` survives a JSON serialize → deserialize round trip.
    #[test]
    fn warmup_options_serialize_round_trip() -> TestResult {
        let original = WarmupOptions {
            url: "https://example.com".to_string(),
            wait: WarmupWait::NetworkIdle,
            timeout_ms: 15_000,
            stabilize_ms: 250,
        };
        let encoded = serde_json::to_string(&original)?;
        let decoded: WarmupOptions = serde_json::from_str(&encoded)?;

        assert_eq!(decoded.url, "https://example.com");
        assert_eq!(decoded.wait, WarmupWait::NetworkIdle);
        assert_eq!(decoded.timeout_ms, 15_000);
        assert_eq!(decoded.stabilize_ms, 250);
        Ok(())
    }

    /// `WarmupWait` defaults to `DomContentLoaded`.
    #[test]
    fn warmup_wait_default_is_dom_content_loaded() {
        let default_wait = WarmupWait::default();
        assert_eq!(default_wait, WarmupWait::DomContentLoaded);
    }

    /// Each `WarmupWait` variant converts to the matching `WaitUntil` variant.
    #[test]
    fn warmup_wait_into_wait_until_variants() {
        let from_dom = WarmupWait::DomContentLoaded.into_wait_until();
        let from_idle = WarmupWait::NetworkIdle.into_wait_until();
        assert!(matches!(from_dom, WaitUntil::DomContentLoaded));
        assert!(matches!(from_idle, WaitUntil::NetworkIdle));
    }

    /// Default refresh options: DOM-content-loaded wait, default timeout,
    /// `reset_connection` off.
    #[test]
    fn refresh_options_defaults() {
        let defaults = RefreshOptions::default();
        assert!(!defaults.reset_connection);
        assert_eq!(defaults.timeout_ms, RefreshOptions::default_timeout_ms());
        assert_eq!(defaults.wait, WarmupWait::DomContentLoaded);
    }

    /// `RefreshOptions` survives a JSON serialize → deserialize round trip.
    #[test]
    fn refresh_options_serialize_round_trip() -> TestResult {
        let original = RefreshOptions {
            wait: WarmupWait::NetworkIdle,
            timeout_ms: 10_000,
            reset_connection: true,
        };
        let encoded = serde_json::to_string(&original)?;
        let decoded: RefreshOptions = serde_json::from_str(&encoded)?;

        assert_eq!(decoded.wait, WarmupWait::NetworkIdle);
        assert_eq!(decoded.timeout_ms, 10_000);
        assert!(decoded.reset_connection);
        Ok(())
    }

    /// `WarmupReport` survives a JSON serialize → deserialize round trip.
    #[test]
    fn warmup_report_serialize_round_trip() -> TestResult {
        let original = WarmupReport {
            url: "https://example.com".to_string(),
            elapsed_ms: 320,
            status_code: Some(200),
            title: "Example Domain".to_string(),
            stabilized: true,
        };
        let encoded = serde_json::to_string(&original)?;
        let decoded: WarmupReport = serde_json::from_str(&encoded)?;

        assert_eq!(decoded.url, "https://example.com");
        assert_eq!(decoded.elapsed_ms, 320);
        assert_eq!(decoded.status_code, Some(200));
        assert_eq!(decoded.title, "Example Domain");
        assert!(decoded.stabilized);
        Ok(())
    }

    /// `RefreshReport` survives a JSON serialize → deserialize round trip.
    #[test]
    fn refresh_report_serialize_round_trip() -> TestResult {
        let original = RefreshReport {
            url: "https://example.com/".to_string(),
            elapsed_ms: 180,
            status_code: Some(304),
        };
        let encoded = serde_json::to_string(&original)?;
        let decoded: RefreshReport = serde_json::from_str(&encoded)?;

        assert_eq!(decoded.url, "https://example.com/");
        assert_eq!(decoded.elapsed_ms, 180);
        assert_eq!(decoded.status_code, Some(304));
        Ok(())
    }

    /// Omitting `stabilize_ms` from the JSON must deserialize to `0` rather
    /// than erroring (the field is expected to carry `#[serde(default)]`).
    #[test]
    fn warmup_options_missing_stabilize_ms_defaults_to_zero() -> TestResult {
        let without_stabilize = r#"{"url":"https://example.com","timeout_ms":30000}"#;
        let parsed: WarmupOptions = serde_json::from_str(without_stabilize)?;
        assert_eq!(parsed.stabilize_ms, 0);
        Ok(())
    }

    // ── Integration tests (require live Chrome — skipped in CI) ──────────────

    /// Warm up a page then immediately extract content from the same origin.
    #[test]
    #[ignore = "requires live Chrome"]
    #[allow(clippy::expect_used)]
    fn integration_warmup_then_extraction() {
        let runtime = tokio::runtime::Runtime::new().expect("tokio runtime");
        runtime.block_on(async {
            use crate::{BrowserConfig, BrowserPool};

            let pool = BrowserPool::new(BrowserConfig::default())
                .await
                .expect("pool");
            let handle = pool.acquire().await.expect("handle");
            let mut page = handle
                .browser()
                .expect("browser")
                .new_page()
                .await
                .expect("page");

            let warmup_options = WarmupOptions {
                url: "https://example.com".to_string(),
                wait: WarmupWait::DomContentLoaded,
                timeout_ms: 30_000,
                stabilize_ms: 0,
            };
            let report = page.warmup(warmup_options).await.expect("warmup");

            assert!(!report.title.is_empty(), "title populated after warmup");
            assert!(report.elapsed_ms > 0);

            // Confirm the page is still usable for further queries.
            let html = page.content().await.expect("content");
            assert!(
                html.contains("example"),
                "page content available after warmup"
            );

            page.close().await.expect("close");
            handle.release().await;
        });
    }

    /// Refresh a page and verify session continuity (URL unchanged, page
    /// still navigable).
    #[test]
    #[ignore = "requires live Chrome"]
    #[allow(clippy::expect_used)]
    fn integration_refresh_keeps_session_state() {
        let runtime = tokio::runtime::Runtime::new().expect("tokio runtime");
        runtime.block_on(async {
            use crate::{BrowserConfig, BrowserPool};

            let pool = BrowserPool::new(BrowserConfig::default())
                .await
                .expect("pool");
            let handle = pool.acquire().await.expect("handle");
            let mut page = handle
                .browser()
                .expect("browser")
                .new_page()
                .await
                .expect("page");

            let initial_timeout = Duration::from_secs(30);
            page.navigate(
                "https://example.com",
                WaitUntil::DomContentLoaded,
                initial_timeout,
            )
            .await
            .expect("initial navigate");

            let refresh_options = RefreshOptions::default();
            let report = page.refresh(refresh_options).await.expect("refresh");

            assert!(
                report.url.contains("example.com"),
                "URL retained after refresh; got: {}",
                report.url
            );
            assert!(report.elapsed_ms > 0);

            page.close().await.expect("close");
            handle.release().await;
        });
    }
}