stygian_browser/interstitial_router/
classifier.rs1use std::collections::BTreeSet;
25
26use serde::{Deserialize, Serialize};
27
28use super::policy::InterstitialKind;
29
30#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
54pub struct PageSignature {
55 pub url: String,
57 pub status_code: Option<u16>,
59 pub body_markers: Vec<String>,
62 pub header_set: Vec<String>,
65 pub redirect_url: Option<String>,
67 pub queue_position_hint: Option<u32>,
69 pub vendor_hint: Option<String>,
72}
73
74impl PageSignature {
75 #[must_use]
78 pub fn new(url: impl Into<String>, status_code: Option<u16>) -> Self {
79 Self {
80 url: url.into(),
81 status_code,
82 body_markers: Vec::new(),
83 header_set: Vec::new(),
84 redirect_url: None,
85 queue_position_hint: None,
86 vendor_hint: None,
87 }
88 }
89
90 #[must_use]
94 pub fn with_body_marker(mut self, marker: impl Into<String>) -> Self {
95 let marker = marker.into().trim().to_ascii_lowercase();
96 if !marker.is_empty() && !self.body_markers.iter().any(|m| m == &marker) {
97 self.body_markers.push(marker);
98 }
99 self
100 }
101
102 #[must_use]
106 pub fn with_header(mut self, header: impl Into<String>) -> Self {
107 let header = header.into().trim().to_ascii_lowercase();
108 if !header.is_empty() && !self.header_set.iter().any(|h| h == &header) {
109 self.header_set.push(header);
110 }
111 self
112 }
113
114 #[must_use]
116 pub fn with_redirect_url(mut self, redirect_url: impl Into<String>) -> Self {
117 self.redirect_url = Some(redirect_url.into());
118 self
119 }
120
121 #[must_use]
123 pub const fn with_queue_position(mut self, position: u32) -> Self {
124 self.queue_position_hint = Some(position);
125 self
126 }
127
128 #[must_use]
130 pub fn with_vendor_hint(mut self, vendor: impl Into<String>) -> Self {
131 self.vendor_hint = Some(vendor.into());
132 self
133 }
134
135 #[must_use]
137 pub fn with_body_markers(mut self, markers: Vec<String>) -> Self {
138 self.body_markers = markers;
139 self
140 }
141
142 #[must_use]
144 pub fn with_header_set(mut self, headers: Vec<String>) -> Self {
145 self.header_set = headers;
146 self
147 }
148
149 #[must_use]
153 pub fn host(&self) -> Option<String> {
154 let url = self.url.trim();
155 if url.is_empty() {
156 return None;
157 }
158 let without_scheme = url.split_once("://")?.1;
159 let authority = without_scheme.split('/').next()?;
160 let host = authority.rsplit('@').next()?.split(':').next()?;
161 if host.is_empty() {
162 None
163 } else {
164 Some(host.to_ascii_lowercase())
165 }
166 }
167
168 #[must_use]
171 pub fn url_contains(&self, needle_lower: &str) -> bool {
172 self.url.to_ascii_lowercase().contains(needle_lower)
173 }
174
175 #[must_use]
178 pub fn body_contains(&self, needle_lower: &str) -> bool {
179 self.body_markers
180 .iter()
181 .any(|m| m.to_ascii_lowercase().contains(needle_lower))
182 }
183
184 #[must_use]
187 pub fn has_header(&self, name_lower: &str) -> bool {
188 self.header_set
189 .iter()
190 .any(|h| h.eq_ignore_ascii_case(name_lower))
191 }
192
193 #[must_use]
195 pub fn unique_headers(&self) -> BTreeSet<String> {
196 self.header_set
197 .iter()
198 .map(|h| h.to_ascii_lowercase())
199 .collect()
200 }
201}
202
/// Stateless classifier that maps a [`PageSignature`] to an [`InterstitialKind`].
///
/// The only field is a private unit value, so code outside this module cannot
/// build the type with a struct literal and must use `new()` or `Default`.
#[derive(Debug, Clone, Default)]
pub struct InterstitialClassifier {
    /// Carries no data; exists only to keep the struct literal private.
    _private: (),
}
226
227impl InterstitialClassifier {
228 #[must_use]
230 pub const fn new() -> Self {
231 Self { _private: () }
232 }
233
234 #[must_use]
241 pub fn classify(&self, signature: &PageSignature) -> InterstitialKind {
242 if is_hard_block(signature) {
244 return InterstitialKind::HardBlock;
245 }
246
247 if is_challenge(signature) {
249 return InterstitialKind::Challenge;
250 }
251
252 if is_queue(signature) {
254 return InterstitialKind::Queue;
255 }
256
257 if is_transient(signature) {
260 return InterstitialKind::Transient;
261 }
262
263 InterstitialKind::Transient
266 }
267}
268
269fn is_hard_block(signature: &PageSignature) -> bool {
270 if matches!(signature.status_code, Some(403 | 503)) {
273 if HARD_BLOCK_URL_PATTERNS
277 .iter()
278 .any(|p| signature.url_contains(p))
279 {
280 return true;
281 }
282 if HARD_BLOCK_BODY_MARKERS
283 .iter()
284 .any(|m| signature.body_contains(m))
285 {
286 return true;
287 }
288 if signature
289 .vendor_hint
290 .as_deref()
291 .is_some_and(is_hard_block_vendor)
292 {
293 return true;
294 }
295 }
296
297 if matches!(signature.status_code, Some(429))
299 && HARD_BLOCK_BODY_MARKERS
300 .iter()
301 .any(|m| signature.body_contains(m))
302 {
303 return true;
304 }
305
306 if signature.status_code.is_none()
308 && HARD_BLOCK_URL_PATTERNS
309 .iter()
310 .any(|p| signature.url_contains(p))
311 {
312 return true;
313 }
314
315 false
316}
317
318fn is_challenge(signature: &PageSignature) -> bool {
319 if CHALLENGE_BODY_MARKERS
320 .iter()
321 .any(|m| signature.body_contains(m))
322 {
323 return true;
324 }
325 if CHALLENGE_URL_PATTERNS
326 .iter()
327 .any(|p| signature.url_contains(p))
328 {
329 return true;
330 }
331 if CHALLENGE_HEADERS.iter().any(|h| signature.has_header(h)) {
332 return true;
333 }
334 signature
335 .vendor_hint
336 .as_deref()
337 .is_some_and(is_challenge_vendor)
338}
339
340fn is_queue(signature: &PageSignature) -> bool {
341 if signature.queue_position_hint.is_some() {
342 return true;
343 }
344 if QUEUE_BODY_MARKERS
345 .iter()
346 .any(|m| signature.body_contains(m))
347 {
348 return true;
349 }
350 if QUEUE_URL_PATTERNS.iter().any(|p| signature.url_contains(p)) {
351 return true;
352 }
353 if matches!(signature.status_code, Some(202)) {
354 return true;
355 }
356 false
357}
358
359fn is_transient(signature: &PageSignature) -> bool {
360 matches!(signature.status_code, Some(301 | 302 | 303 | 307 | 308))
361 || signature.redirect_url.is_some()
362 || signature.url_contains("/redirect")
363 || signature.url_contains("/continue")
364}
365
366pub(super) const HARD_BLOCK_BODY_MARKERS_PUBLIC: &[&str] = &[
367 "access denied",
368 "request blocked",
369 "you have been blocked",
370 "we have detected unusual traffic",
371 "this site has been blocked",
372 "your request has been denied",
373 "forbidden",
374];
375
376const HARD_BLOCK_BODY_MARKERS: &[&str] = HARD_BLOCK_BODY_MARKERS_PUBLIC;
377
378pub(super) const HARD_BLOCK_URL_PATTERNS_PUBLIC: &[&str] = &[
379 "/blocked",
380 "/forbidden",
381 "/denied",
382 "/err/blocked",
383 "/err/forbidden",
384 "/banned",
385];
386
387const HARD_BLOCK_URL_PATTERNS: &[&str] = HARD_BLOCK_URL_PATTERNS_PUBLIC;
388
389pub(super) const HARD_BLOCK_VENDOR_HINTS_PUBLIC: &[&str] = &["blacklist", "firewall-block"];
390
391const HARD_BLOCK_VENDOR_HINTS: &[&str] = HARD_BLOCK_VENDOR_HINTS_PUBLIC;
392
393fn is_hard_block_vendor(vendor: &str) -> bool {
394 HARD_BLOCK_VENDOR_HINTS
395 .iter()
396 .any(|h| vendor.eq_ignore_ascii_case(h))
397}
398
399pub(super) const CHALLENGE_BODY_MARKERS_PUBLIC: &[&str] = &[
400 "cf-chl-bypass",
401 "cf-challenge",
402 "cf-turnstile",
403 "challenge-platform",
404 "checking your browser",
405 "just a moment",
406 "g-recaptcha",
407 "h-captcha",
408 "hcaptcha",
409 "arkose",
410 "perimeterx",
411 "perimeter x",
412 "press & hold",
413 "press and hold",
414 "akamai bot manager",
415 "akamai_bm",
416 "fingerprint.com",
417 "shape security",
418 "kasada",
419 "datadome",
420 "px-captcha",
421 "_abck",
422];
423
424const CHALLENGE_BODY_MARKERS: &[&str] = CHALLENGE_BODY_MARKERS_PUBLIC;
425
426pub(super) const CHALLENGE_URL_PATTERNS_PUBLIC: &[&str] = &[
427 "/cdn-cgi/challenge-platform",
428 "/cdn-cgi/challenge",
429 "/challenge-platform",
430 "/_px/",
431 "/_abck",
432 "/captcha",
433 "/__challenge",
434 "/arkose",
435 "/px/validate",
436 "/fingerprint",
437 "/datadome",
438];
439
440const CHALLENGE_URL_PATTERNS: &[&str] = CHALLENGE_URL_PATTERNS_PUBLIC;
441
442pub(super) const CHALLENGE_HEADERS_PUBLIC: &[&str] = &[
443 "cf-mitigated",
444 "cf-chl-bypass",
445 "x-captcha",
446 "x-akamai-bot",
447 "x-datadome",
448 "x-perimeterx",
449];
450
451const CHALLENGE_HEADERS: &[&str] = CHALLENGE_HEADERS_PUBLIC;
452
453pub(super) const CHALLENGE_VENDOR_HINTS_PUBLIC: &[&str] = &[
454 "cloudflare",
455 "akamai",
456 "akamai_bot_manager",
457 "perimeterx",
458 "perimeter_x",
459 "datadome",
460 "shape_security",
461 "kasada",
462 "fingerprint_com",
463 "fingerprintcom",
464 "hcaptcha",
465 "recaptcha",
466];
467
468const CHALLENGE_VENDOR_HINTS: &[&str] = CHALLENGE_VENDOR_HINTS_PUBLIC;
469
470fn is_challenge_vendor(vendor: &str) -> bool {
471 CHALLENGE_VENDOR_HINTS
472 .iter()
473 .any(|h| vendor.eq_ignore_ascii_case(h))
474}
475
476pub(super) const QUEUE_BODY_MARKERS_PUBLIC: &[&str] = &[
477 "please wait",
478 "you are in line",
479 "queue position",
480 "your place in line",
481 "estimated wait",
482 "waiting room",
483 "one moment please",
484 "almost done",
485];
486
487const QUEUE_BODY_MARKERS: &[&str] = QUEUE_BODY_MARKERS_PUBLIC;
488
489pub(super) const QUEUE_URL_PATTERNS_PUBLIC: &[&str] =
490 &["/queue", "/waiting", "/wait-room", "/waitroom"];
491
492const QUEUE_URL_PATTERNS: &[&str] = QUEUE_URL_PATTERNS_PUBLIC;
493
#[cfg(test)]
mod tests {
    use super::*;

    /// Classifies `sig` with a freshly constructed classifier.
    fn kind_of(sig: &PageSignature) -> InterstitialKind {
        InterstitialClassifier::new().classify(sig)
    }

    /// Signature for a 403 page at `/blocked` whose body says "access denied".
    fn blocked_page() -> PageSignature {
        PageSignature::new("https://example.com/blocked", Some(403))
            .with_body_marker("access denied")
    }

    /// Signature for a 403 page served from the challenge-platform path.
    fn challenge_page() -> PageSignature {
        PageSignature::new(
            "https://example.com/cdn-cgi/challenge-platform/h/b",
            Some(403),
        )
        .with_body_marker("cf-chl-bypass")
    }

    #[test]
    fn classifier_identifies_queue_via_body_marker() {
        let queued = PageSignature::new("https://example.com/queue", Some(200))
            .with_body_marker("please wait")
            .with_queue_position(3);
        assert_eq!(kind_of(&queued), InterstitialKind::Queue);
    }

    #[test]
    fn classifier_identifies_challenge_via_captcha_marker() {
        let challenged = challenge_page()
            .with_header("cf-mitigated")
            .with_vendor_hint("cloudflare");
        assert_eq!(kind_of(&challenged), InterstitialKind::Challenge);
    }

    #[test]
    fn classifier_identifies_hard_block_via_status_and_marker() {
        assert_eq!(kind_of(&blocked_page()), InterstitialKind::HardBlock);
    }

    #[test]
    fn classifier_identifies_transient_via_3xx_redirect() {
        let redirected = PageSignature::new("https://example.com/redirect", Some(302));
        assert_eq!(kind_of(&redirected), InterstitialKind::Transient);
    }

    #[test]
    fn classifier_default_unclassified_is_transient() {
        let plain = PageSignature::new("https://example.com/some-page", Some(200));
        assert_eq!(kind_of(&plain), InterstitialKind::Transient);
    }

    #[test]
    fn classifier_is_deterministic_for_identical_signatures() {
        // One classifier instance is reused for both calls on purpose.
        let classifier = InterstitialClassifier::new();
        let blocked = blocked_page();
        let first = classifier.classify(&blocked);
        let second = classifier.classify(&blocked);
        assert_eq!(first, second);
        assert_eq!(first, InterstitialKind::HardBlock);
    }

    #[test]
    fn classifier_precedence_hard_block_wins_over_challenge() {
        // Carries both hard-block and challenge evidence.
        let mixed = blocked_page().with_body_marker("cf-chl-bypass");
        assert_eq!(kind_of(&mixed), InterstitialKind::HardBlock);
    }

    #[test]
    fn classifier_precedence_challenge_wins_over_queue() {
        // Carries both challenge and queue evidence.
        let mixed = challenge_page().with_body_marker("please wait");
        assert_eq!(kind_of(&mixed), InterstitialKind::Challenge);
    }

    #[test]
    fn page_signature_builder_dedupes_markers() {
        let sig = PageSignature::new("https://example.com", None)
            .with_body_marker("please wait")
            .with_body_marker("please wait")
            .with_body_marker("Please Wait")
            .with_header("x-foo")
            .with_header("X-Foo");
        assert_eq!(sig.body_markers.len(), 1);
        assert_eq!(sig.header_set.len(), 1);
    }

    #[test]
    fn page_signature_host_extracts_lowercase_authority() {
        let sig = PageSignature::new("https://User:Pass@Example.COM:8443/path", None);
        assert_eq!(sig.host().as_deref(), Some("example.com"));
    }

    #[test]
    fn page_signature_host_returns_none_for_empty() {
        let sig = PageSignature::new("", None);
        assert!(sig.host().is_none());
    }
}