Skip to main content

stygian_browser/interstitial_router/
classifier.rs

//! Pure deterministic interstitial classifier.
//!
//! Consumes a [`PageSignature`] and returns the matching
//! [`InterstitialKind`]. The classifier is a finite cascade
//! of structural rules; the first rule that matches wins:
//!
//! 1. **Hard block** — a `403`/`503` paired with a terminal
//!    block marker in the body, a known block URL, or a
//!    hard-block vendor hint; a `429` with a block marker in
//!    the body; or a known block URL when the status is
//!    unknown.
//! 2. **Challenge** — vendor-issued challenge markers in
//!    the body, URL, or headers (Cloudflare `cf-chl-bypass`,
//!    hCaptcha, reCAPTCHA, Akamai `_abck`, `PerimeterX`
//!    `_px`, etc.), or a challenge vendor hint.
//! 3. **Queue** — "please wait" / waiting-room markers in
//!    the body or URL, an explicit queue position hint, or
//!    a `202` status.
//! 4. **Transient** — a `301`/`302`/`303`/`307`/`308`
//!    redirect, an explicit redirect target, or a URL
//!    containing `/redirect` or `/continue`, when none of
//!    the rules above matched.
//! 5. **Default: `Transient`** — unclassified signatures
//!    fall through to the transient (generic retry) bucket
//!    so the runner can take the normal ladder without
//!    penalising unrecognised pages.
23
24use std::collections::BTreeSet;
25
26use serde::{Deserialize, Serialize};
27
28use super::policy::InterstitialKind;
29
30/// Page signature consumed by the [`InterstitialClassifier`].
31///
32/// The signature is the **observation** that a previous
33/// acquisition attempt produced. Callers attach the
34/// signature to an [`AcquisitionRequest`][crate::acquisition::AcquisitionRequest]
35/// via the
36/// [`AcquisitionRequest::interstitial`][crate::acquisition::AcquisitionRequest::interstitial]
37/// field (see `mod.rs` for the runner integration).
38///
39/// # Example
40///
41/// ```
42/// use stygian_browser::interstitial_router::PageSignature;
43///
44/// let signature = PageSignature::new(
45///     "https://example.com/cdn-cgi/challenge-platform/h/b",
46///     Some(403),
47/// )
48/// .with_body_marker("cf-chl-bypass")
49/// .with_header("cf-mitigated");
50/// assert_eq!(signature.body_markers.len(), 1);
51/// assert_eq!(signature.header_set.len(), 1);
52/// ```
53#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
54pub struct PageSignature {
55    /// Target URL of the page.
56    pub url: String,
57    /// HTTP status code, when known.
58    pub status_code: Option<u16>,
59    /// Body substrings (case-insensitive) observed in the
60    /// page. Markers are normalised to lower-case ASCII.
61    pub body_markers: Vec<String>,
62    /// Lower-case ASCII header names observed in the
63    /// response.
64    pub header_set: Vec<String>,
65    /// Optional redirect target for a 3xx response.
66    pub redirect_url: Option<String>,
67    /// Optional queue position hint (1-based).
68    pub queue_position_hint: Option<u32>,
69    /// Optional vendor hint (e.g. `cloudflare`,
70    /// `akamai`).
71    pub vendor_hint: Option<String>,
72}
73
74impl PageSignature {
75    /// Build a signature with the supplied `url` and
76    /// `status_code` and no other fields set.
77    #[must_use]
78    pub fn new(url: impl Into<String>, status_code: Option<u16>) -> Self {
79        Self {
80            url: url.into(),
81            status_code,
82            body_markers: Vec::new(),
83            header_set: Vec::new(),
84            redirect_url: None,
85            queue_position_hint: None,
86            vendor_hint: None,
87        }
88    }
89
90    /// Builder: add a body marker (case-insensitive). The
91    /// marker is trimmed and lower-cased; empty markers are
92    /// ignored.
93    #[must_use]
94    pub fn with_body_marker(mut self, marker: impl Into<String>) -> Self {
95        let marker = marker.into().trim().to_ascii_lowercase();
96        if !marker.is_empty() && !self.body_markers.iter().any(|m| m == &marker) {
97            self.body_markers.push(marker);
98        }
99        self
100    }
101
102    /// Builder: add a header name (case-insensitive). The
103    /// name is trimmed and lower-cased; empty names are
104    /// ignored.
105    #[must_use]
106    pub fn with_header(mut self, header: impl Into<String>) -> Self {
107        let header = header.into().trim().to_ascii_lowercase();
108        if !header.is_empty() && !self.header_set.iter().any(|h| h == &header) {
109            self.header_set.push(header);
110        }
111        self
112    }
113
114    /// Builder: set the redirect target.
115    #[must_use]
116    pub fn with_redirect_url(mut self, redirect_url: impl Into<String>) -> Self {
117        self.redirect_url = Some(redirect_url.into());
118        self
119    }
120
121    /// Builder: set the queue position hint.
122    #[must_use]
123    pub const fn with_queue_position(mut self, position: u32) -> Self {
124        self.queue_position_hint = Some(position);
125        self
126    }
127
128    /// Builder: set the vendor hint.
129    #[must_use]
130    pub fn with_vendor_hint(mut self, vendor: impl Into<String>) -> Self {
131        self.vendor_hint = Some(vendor.into());
132        self
133    }
134
135    /// Builder: replace the body marker set.
136    #[must_use]
137    pub fn with_body_markers(mut self, markers: Vec<String>) -> Self {
138        self.body_markers = markers;
139        self
140    }
141
142    /// Builder: replace the header set.
143    #[must_use]
144    pub fn with_header_set(mut self, headers: Vec<String>) -> Self {
145        self.header_set = headers;
146        self
147    }
148
149    /// Lower-case ASCII view of the URL host, when
150    /// parseable. Returns `None` when the URL is empty or
151    /// malformed.
152    #[must_use]
153    pub fn host(&self) -> Option<String> {
154        let url = self.url.trim();
155        if url.is_empty() {
156            return None;
157        }
158        let without_scheme = url.split_once("://")?.1;
159        let authority = without_scheme.split('/').next()?;
160        let host = authority.rsplit('@').next()?.split(':').next()?;
161        if host.is_empty() {
162            None
163        } else {
164            Some(host.to_ascii_lowercase())
165        }
166    }
167
168    /// `true` when the URL path (or query) contains the
169    /// given lower-case substring.
170    #[must_use]
171    pub fn url_contains(&self, needle_lower: &str) -> bool {
172        self.url.to_ascii_lowercase().contains(needle_lower)
173    }
174
175    /// `true` when any of the body markers contain the
176    /// given lower-case substring.
177    #[must_use]
178    pub fn body_contains(&self, needle_lower: &str) -> bool {
179        self.body_markers
180            .iter()
181            .any(|m| m.to_ascii_lowercase().contains(needle_lower))
182    }
183
184    /// `true` when the header set contains the given
185    /// lower-case header name.
186    #[must_use]
187    pub fn has_header(&self, name_lower: &str) -> bool {
188        self.header_set
189            .iter()
190            .any(|h| h.eq_ignore_ascii_case(name_lower))
191    }
192
193    /// Lower-case unique header set (for diagnostics).
194    #[must_use]
195    pub fn unique_headers(&self) -> BTreeSet<String> {
196        self.header_set
197            .iter()
198            .map(|h| h.to_ascii_lowercase())
199            .collect()
200    }
201}
202
203/// Deterministic interstitial classifier.
204///
205/// The classifier is a pure function
206/// `&PageSignature -> InterstitialKind`. It performs no I/O
207/// and reads no clock — it can be unit-tested across the
208/// full rule matrix without booting Chrome.
209///
210/// # Example
211///
212/// ```
213/// use stygian_browser::interstitial_router::{
214///     InterstitialClassifier, InterstitialKind, PageSignature,
215/// };
216///
217/// let classifier = InterstitialClassifier::new();
218/// let sig = PageSignature::new("https://example.com/queue", Some(200))
219///     .with_body_marker("please wait");
220/// assert_eq!(classifier.classify(&sig), InterstitialKind::Queue);
221/// ```
222#[derive(Debug, Clone, Default)]
223pub struct InterstitialClassifier {
224    _private: (),
225}
226
227impl InterstitialClassifier {
228    /// Build a default classifier.
229    #[must_use]
230    pub const fn new() -> Self {
231        Self { _private: () }
232    }
233
234    /// Classify `signature` into an [`InterstitialKind`].
235    ///
236    /// The function is a finite cascade: hard block →
237    /// challenge → queue → transient → transient
238    /// (default). The first rule that matches wins. The
239    /// rules are documented in the module-level doc.
240    #[must_use]
241    pub fn classify(&self, signature: &PageSignature) -> InterstitialKind {
242        // 1. Hard block.
243        if is_hard_block(signature) {
244            return InterstitialKind::HardBlock;
245        }
246
247        // 2. Challenge.
248        if is_challenge(signature) {
249            return InterstitialKind::Challenge;
250        }
251
252        // 3. Queue.
253        if is_queue(signature) {
254            return InterstitialKind::Queue;
255        }
256
257        // 4. Transient: any 3xx with no body markers, or
258        //    a URL that looks like a redirect.
259        if is_transient(signature) {
260            return InterstitialKind::Transient;
261        }
262
263        // 5. Default: transient (most permissive — runner
264        //    falls through to the normal ladder).
265        InterstitialKind::Transient
266    }
267}
268
269fn is_hard_block(signature: &PageSignature) -> bool {
270    // Status code 403 + block markers, or 503, or a known
271    // block URL pattern.
272    if matches!(signature.status_code, Some(403 | 503)) {
273        // 403 alone isn't a hard block — only when paired
274        // with a block marker, a block URL, or a hard
275        // block vendor hint.
276        if HARD_BLOCK_URL_PATTERNS
277            .iter()
278            .any(|p| signature.url_contains(p))
279        {
280            return true;
281        }
282        if HARD_BLOCK_BODY_MARKERS
283            .iter()
284            .any(|m| signature.body_contains(m))
285        {
286            return true;
287        }
288        if signature
289            .vendor_hint
290            .as_deref()
291            .is_some_and(is_hard_block_vendor)
292        {
293            return true;
294        }
295    }
296
297    // 429 with a block body is also a hard block.
298    if matches!(signature.status_code, Some(429))
299        && HARD_BLOCK_BODY_MARKERS
300            .iter()
301            .any(|m| signature.body_contains(m))
302    {
303        return true;
304    }
305
306    // URL-only pattern (no status known).
307    if signature.status_code.is_none()
308        && HARD_BLOCK_URL_PATTERNS
309            .iter()
310            .any(|p| signature.url_contains(p))
311    {
312        return true;
313    }
314
315    false
316}
317
318fn is_challenge(signature: &PageSignature) -> bool {
319    if CHALLENGE_BODY_MARKERS
320        .iter()
321        .any(|m| signature.body_contains(m))
322    {
323        return true;
324    }
325    if CHALLENGE_URL_PATTERNS
326        .iter()
327        .any(|p| signature.url_contains(p))
328    {
329        return true;
330    }
331    if CHALLENGE_HEADERS.iter().any(|h| signature.has_header(h)) {
332        return true;
333    }
334    signature
335        .vendor_hint
336        .as_deref()
337        .is_some_and(is_challenge_vendor)
338}
339
340fn is_queue(signature: &PageSignature) -> bool {
341    if signature.queue_position_hint.is_some() {
342        return true;
343    }
344    if QUEUE_BODY_MARKERS
345        .iter()
346        .any(|m| signature.body_contains(m))
347    {
348        return true;
349    }
350    if QUEUE_URL_PATTERNS.iter().any(|p| signature.url_contains(p)) {
351        return true;
352    }
353    if matches!(signature.status_code, Some(202)) {
354        return true;
355    }
356    false
357}
358
359fn is_transient(signature: &PageSignature) -> bool {
360    matches!(signature.status_code, Some(301 | 302 | 303 | 307 | 308))
361        || signature.redirect_url.is_some()
362        || signature.url_contains("/redirect")
363        || signature.url_contains("/continue")
364}
365
366pub(super) const HARD_BLOCK_BODY_MARKERS_PUBLIC: &[&str] = &[
367    "access denied",
368    "request blocked",
369    "you have been blocked",
370    "we have detected unusual traffic",
371    "this site has been blocked",
372    "your request has been denied",
373    "forbidden",
374];
375
376const HARD_BLOCK_BODY_MARKERS: &[&str] = HARD_BLOCK_BODY_MARKERS_PUBLIC;
377
378pub(super) const HARD_BLOCK_URL_PATTERNS_PUBLIC: &[&str] = &[
379    "/blocked",
380    "/forbidden",
381    "/denied",
382    "/err/blocked",
383    "/err/forbidden",
384    "/banned",
385];
386
387const HARD_BLOCK_URL_PATTERNS: &[&str] = HARD_BLOCK_URL_PATTERNS_PUBLIC;
388
389pub(super) const HARD_BLOCK_VENDOR_HINTS_PUBLIC: &[&str] = &["blacklist", "firewall-block"];
390
391const HARD_BLOCK_VENDOR_HINTS: &[&str] = HARD_BLOCK_VENDOR_HINTS_PUBLIC;
392
393fn is_hard_block_vendor(vendor: &str) -> bool {
394    HARD_BLOCK_VENDOR_HINTS
395        .iter()
396        .any(|h| vendor.eq_ignore_ascii_case(h))
397}
398
399pub(super) const CHALLENGE_BODY_MARKERS_PUBLIC: &[&str] = &[
400    "cf-chl-bypass",
401    "cf-challenge",
402    "cf-turnstile",
403    "challenge-platform",
404    "checking your browser",
405    "just a moment",
406    "g-recaptcha",
407    "h-captcha",
408    "hcaptcha",
409    "arkose",
410    "perimeterx",
411    "perimeter x",
412    "press & hold",
413    "press and hold",
414    "akamai bot manager",
415    "akamai_bm",
416    "fingerprint.com",
417    "shape security",
418    "kasada",
419    "datadome",
420    "px-captcha",
421    "_abck",
422];
423
424const CHALLENGE_BODY_MARKERS: &[&str] = CHALLENGE_BODY_MARKERS_PUBLIC;
425
426pub(super) const CHALLENGE_URL_PATTERNS_PUBLIC: &[&str] = &[
427    "/cdn-cgi/challenge-platform",
428    "/cdn-cgi/challenge",
429    "/challenge-platform",
430    "/_px/",
431    "/_abck",
432    "/captcha",
433    "/__challenge",
434    "/arkose",
435    "/px/validate",
436    "/fingerprint",
437    "/datadome",
438];
439
440const CHALLENGE_URL_PATTERNS: &[&str] = CHALLENGE_URL_PATTERNS_PUBLIC;
441
442pub(super) const CHALLENGE_HEADERS_PUBLIC: &[&str] = &[
443    "cf-mitigated",
444    "cf-chl-bypass",
445    "x-captcha",
446    "x-akamai-bot",
447    "x-datadome",
448    "x-perimeterx",
449];
450
451const CHALLENGE_HEADERS: &[&str] = CHALLENGE_HEADERS_PUBLIC;
452
453pub(super) const CHALLENGE_VENDOR_HINTS_PUBLIC: &[&str] = &[
454    "cloudflare",
455    "akamai",
456    "akamai_bot_manager",
457    "perimeterx",
458    "perimeter_x",
459    "datadome",
460    "shape_security",
461    "kasada",
462    "fingerprint_com",
463    "fingerprintcom",
464    "hcaptcha",
465    "recaptcha",
466];
467
468const CHALLENGE_VENDOR_HINTS: &[&str] = CHALLENGE_VENDOR_HINTS_PUBLIC;
469
470fn is_challenge_vendor(vendor: &str) -> bool {
471    CHALLENGE_VENDOR_HINTS
472        .iter()
473        .any(|h| vendor.eq_ignore_ascii_case(h))
474}
475
476pub(super) const QUEUE_BODY_MARKERS_PUBLIC: &[&str] = &[
477    "please wait",
478    "you are in line",
479    "queue position",
480    "your place in line",
481    "estimated wait",
482    "waiting room",
483    "one moment please",
484    "almost done",
485];
486
487const QUEUE_BODY_MARKERS: &[&str] = QUEUE_BODY_MARKERS_PUBLIC;
488
489pub(super) const QUEUE_URL_PATTERNS_PUBLIC: &[&str] =
490    &["/queue", "/waiting", "/wait-room", "/waitroom"];
491
492const QUEUE_URL_PATTERNS: &[&str] = QUEUE_URL_PATTERNS_PUBLIC;
493
494// ─── Tests ────────────────────────────────────────────────────────────────────
495
#[cfg(test)]
mod tests {
    use super::*;

    /// Classify `signature` with a freshly built classifier.
    fn kind_of(signature: &PageSignature) -> InterstitialKind {
        InterstitialClassifier::new().classify(signature)
    }

    /// Signature for a Cloudflare challenge-platform URL
    /// answered with a 403.
    fn challenge_page() -> PageSignature {
        PageSignature::new(
            "https://example.com/cdn-cgi/challenge-platform/h/b",
            Some(403),
        )
    }

    /// Signature for a `/blocked` URL answered with a 403
    /// and an "access denied" body.
    fn blocked_page() -> PageSignature {
        PageSignature::new("https://example.com/blocked", Some(403))
            .with_body_marker("access denied")
    }

    #[test]
    fn classifier_identifies_queue_via_body_marker() {
        let signature = PageSignature::new("https://example.com/queue", Some(200))
            .with_body_marker("please wait")
            .with_queue_position(3);
        assert_eq!(kind_of(&signature), InterstitialKind::Queue);
    }

    #[test]
    fn classifier_identifies_challenge_via_captcha_marker() {
        let signature = challenge_page()
            .with_body_marker("cf-chl-bypass")
            .with_header("cf-mitigated")
            .with_vendor_hint("cloudflare");
        assert_eq!(kind_of(&signature), InterstitialKind::Challenge);
    }

    #[test]
    fn classifier_identifies_hard_block_via_status_and_marker() {
        assert_eq!(kind_of(&blocked_page()), InterstitialKind::HardBlock);
    }

    #[test]
    fn classifier_identifies_transient_via_3xx_redirect() {
        let signature = PageSignature::new("https://example.com/redirect", Some(302));
        assert_eq!(kind_of(&signature), InterstitialKind::Transient);
    }

    #[test]
    fn classifier_default_unclassified_is_transient() {
        let signature = PageSignature::new("https://example.com/some-page", Some(200));
        assert_eq!(kind_of(&signature), InterstitialKind::Transient);
    }

    #[test]
    fn classifier_is_deterministic_for_identical_signatures() {
        // Same classifier instance, same signature, twice.
        let classifier = InterstitialClassifier::new();
        let signature = blocked_page();
        let first = classifier.classify(&signature);
        let second = classifier.classify(&signature);
        assert_eq!(first, second);
        assert_eq!(first, InterstitialKind::HardBlock);
    }

    #[test]
    fn classifier_precedence_hard_block_wins_over_challenge() {
        // Hard-block and challenge markers together: the
        // higher-precedence hard block must win.
        let signature = blocked_page().with_body_marker("cf-chl-bypass");
        assert_eq!(kind_of(&signature), InterstitialKind::HardBlock);
    }

    #[test]
    fn classifier_precedence_challenge_wins_over_queue() {
        // Challenge and queue markers together: challenge
        // must win.
        let signature = challenge_page()
            .with_body_marker("cf-chl-bypass")
            .with_body_marker("please wait");
        assert_eq!(kind_of(&signature), InterstitialKind::Challenge);
    }

    #[test]
    fn page_signature_builder_dedupes_markers() {
        let signature = ["please wait", "please wait", "Please Wait"]
            .into_iter()
            .fold(PageSignature::new("https://example.com", None), |sig, marker| {
                sig.with_body_marker(marker)
            })
            .with_header("x-foo")
            .with_header("X-Foo");
        assert_eq!(signature.body_markers.len(), 1);
        assert_eq!(signature.header_set.len(), 1);
    }

    #[test]
    fn page_signature_host_extracts_lowercase_authority() {
        let signature = PageSignature::new("https://User:Pass@Example.COM:8443/path", None);
        assert_eq!(signature.host().as_deref(), Some("example.com"));
    }

    #[test]
    fn page_signature_host_returns_none_for_empty() {
        let signature = PageSignature::new("", None);
        assert!(signature.host().is_none());
    }
}
602}