stygian_charon/vendor_classifier/classifier.rs
1//! Vendor classification engine (T89).
2//!
3//! The [`VendorClassifier`] is a deterministic, evidence-emitting
4//! ranker that consumes cookies, headers, challenge URLs, and body
5//! markers and produces a ranked vendor scoreboard. It is the
6//! primary input to T88 (anti-bot change-detection feed) and T90
7//! (vendor-to-playbook auto-resolution).
8//!
9//! ## Confidence formula
10//!
11//! For each [`VendorDefinition`] the classifier sums the weights
12//! of the matched signals. The **top vendor**'s confidence is then
13//!
14//! ```text
15//! confidence = top_score / (top_score + second_score)
16//! ```
17//!
18//! which is the same Jaccard-style ratio the existing
19//! [`crate::classifier::classify_transaction`] uses. When only one
20//! vendor matched, `confidence = 1.0`. When no vendor matched, the
21//! classification is reported as [`VendorId::Unknown`] with
22//! `confidence = 0.0`.
23//!
24//! ## Deterministic tie-break rule
25//!
26//! When two or more vendors tie on the **same top score**, the
27//! tie is broken by [`VendorId`] discriminant order: the variant
28//! declared **earlier** in the enum wins. This means
29//! `Akamai < Cloudflare < DataDome < PerimeterX < …` — the same
//! order the enum source declares. The order is stable across
//! releases and is exactly the order produced by the
//! [`Ord`][std::cmp::Ord] implementation derived on [`VendorId`].
33//!
34//! ## High-confidence threshold
35//!
36//! The classifier carries a configurable threshold
37//! [`DEFAULT_HIGH_CONFIDENCE_THRESHOLD`] (0.60). The
38//! [`VendorClassification::is_high_confidence`] flag is set when
39//! the top vendor's confidence crosses the threshold. Callers can
40//! override the threshold via
41//! [`VendorClassifier::with_threshold`].
42//!
43//! # Example
44//!
45//! ```
46//! use stygian_charon::vendor_classifier::{VendorClassifier, VendorId, EvidenceSource};
47//! use std::collections::BTreeMap;
48//!
49//! let classifier = VendorClassifier::with_builtin_defaults();
50//! let mut headers = BTreeMap::new();
51//! headers.insert("cf-ray".to_string(), "abc-ORD".to_string());
52//! headers.insert("server".to_string(), "cloudflare".to_string());
53//! let cookies = vec!["__cf_bm=xyz; path=/".to_string()];
54//! let body = "Attention required! | cloudflare".to_string();
55//! let url = "https://example.com/cdn-cgi/challenge-platform";
56//!
57//! let classification = classifier.classify(&cookies, &headers, Some(&body), url);
58//! assert_eq!(classification.top_vendor, VendorId::Cloudflare);
59//! assert!(classification.is_high_confidence);
60//! assert!(classification.confidence > 0.0);
61//! ```
62
63use std::collections::BTreeMap;
64
65use serde::{Deserialize, Serialize};
66
67use crate::har;
68use crate::types::TransactionView;
69use crate::vendor_classifier::evidence::{Evidence, EvidenceBundle, EvidenceSource};
70use crate::vendor_classifier::vendor::{VendorDefinition, VendorId};
71
/// Default confidence threshold for the
/// [`VendorClassification::is_high_confidence`] flag.
///
/// [`VendorClassifier::new`] starts from this value. Callers can
/// override it via [`VendorClassifier::with_threshold`]; a supplied
/// value that is non-finite or outside the `(0.0, 1.0]` range is
/// rejected and this default is used instead.
pub const DEFAULT_HIGH_CONFIDENCE_THRESHOLD: f64 = 0.60;
79
/// Maximum confidence. Reported by [`VendorClassifier::classify`]
/// when the scoreboard holds exactly one vendor and that vendor
/// scored above zero, so there is no runner-up to compare against.
const FULL_CONFIDENCE: f64 = 1.0;
82
/// Per-vendor scorecard returned by the classifier.
///
/// A `VendorScore` records the **total weighted signal count** for
/// a single vendor along with a per-source tally of the evidence
/// that contributed. [`VendorClassifier::classify`] returns the
/// scores in **rank order** (highest score first, ties broken by
/// ascending [`VendorId`]).
///
/// # Example
///
/// ```
/// use stygian_charon::vendor_classifier::{EvidenceSource, VendorId, VendorScore};
///
/// let score = VendorScore {
///     vendor: VendorId::Cloudflare,
///     score: 10,
///     matched_sources: vec![(EvidenceSource::Header, 2), (EvidenceSource::Cookie, 1)]
///         .into_iter()
///         .collect(),
/// };
/// assert_eq!(score.vendor, VendorId::Cloudflare);
/// assert_eq!(score.score, 10);
/// ```
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct VendorScore {
    /// Vendor this score belongs to.
    pub vendor: VendorId,
    /// Sum of the matched signal weights.
    pub score: u32,
    /// Per-source count of matched signals (`BTreeMap` keeps the
    /// output deterministic). The classifier fills this by counting
    /// the evidence rows whose `(pattern, source)` pair appears in
    /// this vendor's definition.
    pub matched_sources: BTreeMap<EvidenceSource, usize>,
}
114
115impl VendorScore {
116 /// `true` when this score reflects a real (non-zero) match.
117 #[must_use]
118 pub const fn is_match(&self) -> bool {
119 self.score > 0
120 }
121}
122
/// Full vendor classification output.
///
/// Carries the **ranked scoreboard**, the **top vendor** (the
/// confidence-bearing winner), the **confidence** in the top
/// vendor, the **evidence bundle** the score was computed from,
/// and the **high-confidence flag** the operator-facing policy
/// layer reads to decide whether to escalate.
///
/// # Example
///
/// ```
/// use stygian_charon::vendor_classifier::{VendorClassification, VendorId, EvidenceBundle};
///
/// let classification = VendorClassification {
///     top_vendor: VendorId::Cloudflare,
///     confidence: 0.85,
///     is_high_confidence: true,
///     ranked: Vec::new(),
///     evidence: EvidenceBundle::default(),
///     threshold: 0.60,
/// };
/// assert_eq!(classification.top_vendor, VendorId::Cloudflare);
/// assert!(classification.is_high_confidence);
/// ```
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct VendorClassification {
    /// Vendor with the highest (deterministically tie-broken) score,
    /// or [`VendorId::Unknown`] when no vendor scored above zero.
    pub top_vendor: VendorId,
    /// Confidence in `top_vendor` in `[0.0, 1.0]`; `0.0` when the
    /// top vendor is [`VendorId::Unknown`].
    pub confidence: f64,
    /// `true` when `confidence >= threshold` (the "high confidence"
    /// policy-routing flag).
    pub is_high_confidence: bool,
    /// Ranked scoreboard (top first). As produced by
    /// [`VendorClassifier::classify`] it holds one entry per
    /// registered vendor id, including vendors that scored zero.
    pub ranked: Vec<VendorScore>,
    /// Full evidence bundle the score was computed from.
    pub evidence: EvidenceBundle,
    /// Threshold the `is_high_confidence` flag was evaluated against.
    pub threshold: f64,
}
163
164impl VendorClassification {
165 /// `true` when at least one vendor-specific signal matched.
166 #[must_use]
167 pub fn is_identified(&self) -> bool {
168 self.top_vendor != VendorId::Unknown
169 }
170
171 /// `true` when the classification is a clean "no vendor"
172 /// signal (no evidence at all).
173 #[must_use]
174 pub fn is_unknown(&self) -> bool {
175 self.top_vendor == VendorId::Unknown && self.confidence == 0.0
176 }
177}
178
/// Vendor-classification engine.
///
/// Construct with [`VendorClassifier::with_builtin_defaults`] to
/// load the four baseline Tier 1 vendor definitions shipped in
/// `crates/stygian-charon/data/vendors/`, or
/// [`VendorClassifier::new`] for an empty / custom registry.
///
/// The classifier is **stateless** and `Send + Sync` so it can be
/// shared across threads and requests without locking.
///
/// # Example
///
/// ```
/// use stygian_charon::vendor_classifier::{VendorClassifier, VendorId};
/// use std::collections::BTreeMap;
///
/// let empty = VendorClassifier::new(Vec::new());
/// let cookies: Vec<String> = Vec::new();
/// let headers: BTreeMap<String, String> = BTreeMap::new();
/// let classification = empty.classify(&cookies, &headers, None, "https://example.com/");
/// assert_eq!(classification.top_vendor, VendorId::Unknown);
/// assert!(classification.is_unknown());
/// ```
#[derive(Debug, Clone)]
pub struct VendorClassifier {
    /// Registered vendor definitions, in the order they were supplied.
    definitions: Vec<VendorDefinition>,
    /// Confidence at or above which a classification is flagged as
    /// high-confidence; always finite and within `(0.0, 1.0]`.
    threshold: f64,
}
207
208impl VendorClassifier {
209 /// Build a classifier from a pre-loaded list of
210 /// [`VendorDefinition`] entries.
211 ///
212 /// The threshold defaults to
213 /// [`DEFAULT_HIGH_CONFIDENCE_THRESHOLD`]. Override with
214 /// [`with_threshold`][Self::with_threshold].
215 #[must_use]
216 pub const fn new(definitions: Vec<VendorDefinition>) -> Self {
217 Self {
218 definitions,
219 threshold: DEFAULT_HIGH_CONFIDENCE_THRESHOLD,
220 }
221 }
222
223 /// Build a classifier seeded with the four baseline Tier 1
224 /// vendor definitions embedded at compile time from
225 /// `crates/stygian-charon/data/vendors/`.
226 ///
227 /// The compile-time check
228 /// `compile_check_builtin_vendors`
229 /// guarantees that every embedded TOML is valid; if it
230 /// regresses, the build will fail.
231 ///
232 /// # Example
233 ///
234 /// ```
235 /// use stygian_charon::vendor_classifier::{VendorClassifier, VendorId};
236 ///
237 /// let classifier = VendorClassifier::with_builtin_defaults();
238 /// assert!(classifier.contains(VendorId::DataDome));
239 /// assert!(classifier.contains(VendorId::PerimeterX));
240 /// assert!(classifier.contains(VendorId::Akamai));
241 /// assert!(classifier.contains(VendorId::Cloudflare));
242 /// ```
243 #[must_use]
244 pub fn with_builtin_defaults() -> Self {
245 let definitions = crate::vendor_classifier::builtins::builtin_vendors();
246 Self::new(definitions)
247 }
248
249 /// Override the high-confidence threshold. The supplied value
250 /// is clamped to `(0.0, 1.0]`. Non-finite values (`NaN`,
251 /// `±∞`) fall back to
252 /// [`DEFAULT_HIGH_CONFIDENCE_THRESHOLD`].
253 ///
254 /// # Example
255 ///
256 /// ```
257 /// use stygian_charon::vendor_classifier::{VendorClassifier, DEFAULT_HIGH_CONFIDENCE_THRESHOLD};
258 ///
259 /// let classifier = VendorClassifier::new(Vec::new()).with_threshold(0.85);
260 /// assert!((classifier.threshold() - 0.85).abs() < 1e-9);
261 ///
262 /// // Out-of-range values clamp to the default.
263 /// let reset = VendorClassifier::new(Vec::new()).with_threshold(f64::NAN);
264 /// assert!((reset.threshold() - DEFAULT_HIGH_CONFIDENCE_THRESHOLD).abs() < 1e-9);
265 /// ```
266 #[must_use]
267 pub fn with_threshold(mut self, threshold: f64) -> Self {
268 self.threshold = if threshold.is_finite() && threshold > 0.0 && threshold <= 1.0 {
269 threshold
270 } else {
271 DEFAULT_HIGH_CONFIDENCE_THRESHOLD
272 };
273 self
274 }
275
276 /// Configured high-confidence threshold.
277 #[must_use]
278 pub const fn threshold(&self) -> f64 {
279 self.threshold
280 }
281
282 /// `true` when the registry contains a definition for the
283 /// given [`VendorId`].
284 #[must_use]
285 pub fn contains(&self, vendor: VendorId) -> bool {
286 self.definitions.iter().any(|d| d.id == vendor)
287 }
288
289 /// Number of vendor definitions currently registered.
290 #[must_use]
291 pub const fn len(&self) -> usize {
292 self.definitions.len()
293 }
294
295 /// `true` when the registry has no definitions.
296 #[must_use]
297 pub const fn is_empty(&self) -> bool {
298 self.definitions.is_empty()
299 }
300
301 /// Classify a single set of input strings (cookies, headers,
302 /// optional body, request URL) into a ranked vendor
303 /// classification.
304 ///
305 /// The classifier scans every registered
306 /// [`VendorDefinition`]'s signal catalogue and computes a
307 /// per-vendor weighted score. The match is case-insensitive
308 /// (definitions are lower-cased at load time, and the input
309 /// strings are lower-cased at the match site).
310 /// strings are lower-cased at the match site).
311 ///
312 /// # Determinism
313 ///
314 /// - Signals are matched in `(source, pattern)` lex order.
315 /// - Ties on the top score are broken by
316 /// [`VendorId`] discriminant order (see module docs).
317 /// - The output is `Send + Sync` and contains no
318 /// `HashMap`/`HashSet` so the JSON form is byte-stable.
319 #[must_use]
320 pub fn classify(
321 &self,
322 cookies: &[String],
323 headers: &BTreeMap<String, String>,
324 body: Option<&str>,
325 url: &str,
326 ) -> VendorClassification {
327 let mut evidence_items: Vec<Evidence> = Vec::new();
328 let mut scores: BTreeMap<VendorId, VendorScore> = BTreeMap::new();
329
330 for def in &self.definitions {
331 let score = score_definition(def, cookies, headers, body, url, &mut evidence_items);
332 scores.insert(
333 def.id,
334 VendorScore {
335 vendor: def.id,
336 score,
337 matched_sources: BTreeMap::new(),
338 },
339 );
340 }
341
342 // Precompute the per-source count summaries.
343 let mut ranked: Vec<VendorScore> = scores.into_values().collect();
344 for score in &mut ranked {
345 let mut per_source: BTreeMap<EvidenceSource, usize> = BTreeMap::new();
346 for ev in evidence_items.iter().filter(|e| {
347 self.definitions
348 .iter()
349 .find(|d| d.id == score.vendor)
350 .is_some_and(|d| {
351 // Compound (pattern, source) key match: the
352 // vendor's pattern `s.pattern` is compared
353 // against the matched literal `e.signal` plus
354 // the channel `e.source` — the same-name
355 // comparison is intentional, not a typo.
356 #[allow(clippy::suspicious_operation_groupings)]
357 d.signals
358 .iter()
359 .any(|s| s.pattern == e.signal && s.source == e.source)
360 })
361 }) {
362 *per_source.entry(ev.source).or_insert(0) += 1;
363 }
364 score.matched_sources = per_source;
365 }
366
367 // Rank: descending score, then ascending VendorId (the
368 // deterministic tie-break rule).
369 ranked.sort_by(|a, b| b.score.cmp(&a.score).then_with(|| a.vendor.cmp(&b.vendor)));
370
371 let (top, second) = match ranked.as_slice() {
372 [] => (None, None),
373 [single] => (Some(single), None),
374 [first, rest @ ..] => (Some(first), rest.first()),
375 };
376
377 let (top_vendor, confidence) = match (top, second) {
378 (Some(primary), Some(secondary)) if primary.score > 0 => {
379 let denom = u64::from(primary.score) + u64::from(secondary.score);
380 let conf = if denom == 0 {
381 0.0
382 } else {
383 // u32 scores are well within the f64 mantissa
384 // (max ~4.3B), so the precision loss is
385 // bounded and intentional.
386 #[allow(clippy::cast_precision_loss)]
387 let result = f64::from(primary.score) / (denom as f64);
388 result
389 };
390 (primary.vendor, conf)
391 }
392 (Some(primary), _) if primary.score > 0 => (primary.vendor, FULL_CONFIDENCE),
393 _ => (VendorId::Unknown, 0.0),
394 };
395
396 let is_high_confidence = confidence >= self.threshold;
397
398 let mut source_summary: BTreeMap<EvidenceSource, usize> = BTreeMap::new();
399 for ev in &evidence_items {
400 *source_summary.entry(ev.source).or_insert(0) += 1;
401 }
402 let evidence = EvidenceBundle {
403 items: evidence_items,
404 source_summary,
405 };
406
407 VendorClassification {
408 top_vendor,
409 confidence,
410 is_high_confidence,
411 ranked,
412 evidence,
413 threshold: self.threshold,
414 }
415 }
416
417 /// Convenience wrapper around
418 /// [`classify`][Self::classify] that pulls the inputs out of a
419 /// [`TransactionView`].
420 ///
421 /// Cookies are extracted from the `set-cookie` / `cookie`
422 /// response header (everything else is treated as a generic
423 /// header). The body is the `response_body_snippet`. The URL
424 /// is `tx.url`.
425 #[must_use]
426 pub fn classify_view(&self, tx: &TransactionView) -> VendorClassification {
427 let cookies = extract_cookies(&tx.response_headers);
428 self.classify(
429 &cookies,
430 &tx.response_headers,
431 tx.response_body_snippet.as_deref(),
432 &tx.url,
433 )
434 }
435
436 /// Classify every transaction in a HAR payload and return the
437 /// top vendor's classification. Cookies, headers, and body
438 /// snippets are pulled from each HAR entry directly.
439 ///
440 /// # Errors
441 ///
442 /// Returns [`har::HarError`] when the HAR JSON is invalid or
443 /// exceeds a configured safety limit.
444 pub fn classify_har(&self, har_json: &str) -> Result<VendorClassification, har::HarError> {
445 let parsed = har::parse_har_transactions(har_json)?;
446 // Each transaction is classified independently; the
447 // **final** classification is the one with the highest
448 // confidence. This keeps the output focused on the
449 // strongest single piece of evidence (typically the
450 // challenge response, which is a single transaction in a
451 // capture).
452 let mut best: Option<VendorClassification> = None;
453 for entry in parsed.requests {
454 let view: TransactionView = entry.into();
455 let classification = self.classify_view(&view);
456 // Higher confidence wins; ties broken by the
457 // deterministic `VendorId` order (lower discriminant
458 // wins). The float comparison is intentional — the
459 // confidence is derived deterministically from the
460 // weighted scoreboard, so equality is meaningful.
461 #[allow(clippy::float_cmp)]
462 let is_better = match &best {
463 None => true,
464 Some(prev) => {
465 classification.confidence > prev.confidence
466 || (classification.confidence == prev.confidence
467 && classification.top_vendor < prev.top_vendor)
468 }
469 };
470 if is_better {
471 best = Some(classification);
472 }
473 }
474 Ok(best.unwrap_or_else(|| VendorClassification {
475 top_vendor: VendorId::Unknown,
476 confidence: 0.0,
477 is_high_confidence: false,
478 ranked: Vec::new(),
479 evidence: EvidenceBundle::default(),
480 threshold: self.threshold,
481 }))
482 }
483}
484
485fn score_definition(
486 def: &VendorDefinition,
487 cookies: &[String],
488 headers: &BTreeMap<String, String>,
489 body: Option<&str>,
490 url: &str,
491 evidence: &mut Vec<Evidence>,
492) -> u32 {
493 let mut total: u32 = 0;
494 let body_lower = body.map(str::to_ascii_lowercase);
495 let url_lower = url.to_ascii_lowercase();
496 let grouped = def.signals_by_source();
497
498 for (source, signals) in &grouped {
499 match source {
500 EvidenceSource::Cookie => {
501 for cookie in cookies {
502 let lower = cookie.to_ascii_lowercase();
503 for sig in signals {
504 if lower.contains(&sig.pattern) {
505 total = total.saturating_add(sig.weight);
506 evidence.push(Evidence {
507 signal: sig.pattern.clone(),
508 source: EvidenceSource::Cookie,
509 weight: sig.weight,
510 });
511 }
512 }
513 }
514 }
515 EvidenceSource::Header => {
516 for (name, value) in headers {
517 // Skip the `set-cookie` / `cookie` headers —
518 // they are scored as cookies, not generic
519 // headers, to avoid double-counting the same
520 // signal in two sources.
521 let lower_name = name.to_ascii_lowercase();
522 if lower_name == "set-cookie" || lower_name == "cookie" {
523 continue;
524 }
525 let haystack = format!("{lower_name}:{}", value.to_ascii_lowercase());
526 for sig in signals {
527 if haystack.contains(&sig.pattern) {
528 total = total.saturating_add(sig.weight);
529 evidence.push(Evidence {
530 signal: sig.pattern.clone(),
531 source: EvidenceSource::Header,
532 weight: sig.weight,
533 });
534 }
535 }
536 }
537 }
538 EvidenceSource::ChallengeUrl => {
539 for sig in signals {
540 if url_lower.contains(&sig.pattern) {
541 total = total.saturating_add(sig.weight);
542 evidence.push(Evidence {
543 signal: sig.pattern.clone(),
544 source: EvidenceSource::ChallengeUrl,
545 weight: sig.weight,
546 });
547 }
548 }
549 }
550 EvidenceSource::BodyMarker => {
551 if let Some(body) = &body_lower {
552 for sig in signals {
553 if body.contains(&sig.pattern) {
554 total = total.saturating_add(sig.weight);
555 evidence.push(Evidence {
556 signal: sig.pattern.clone(),
557 source: EvidenceSource::BodyMarker,
558 weight: sig.weight,
559 });
560 }
561 }
562 }
563 }
564 EvidenceSource::Script => {
565 // The classifier does not currently surface a
566 // separate script snippet, so the `script` source
567 // folds into the body marker matching. This keeps
568 // the public API stable: a future `script` field
569 // on the classifier input can be added without
570 // changing the wire format.
571 if let Some(body) = &body_lower {
572 for sig in signals {
573 if body.contains(&sig.pattern) {
574 total = total.saturating_add(sig.weight);
575 evidence.push(Evidence {
576 signal: sig.pattern.clone(),
577 source: EvidenceSource::Script,
578 weight: sig.weight,
579 });
580 }
581 }
582 }
583 }
584 }
585 }
586
587 // De-duplicate evidence rows that came from the same
588 // pattern + source pair (e.g. the same cookie value
589 // appearing in multiple header rows). Keeping one row per
590 // (source, pattern) preserves the audit trail without
591 // double-counting.
592 evidence.sort_by(|a, b| (a.source, &a.signal).cmp(&(b.source, &b.signal)));
593 evidence.dedup_by(|a, b| a.source == b.source && a.signal == b.signal);
594
595 total
596}
597
/// Collect the values of every header whose name is `set-cookie`
/// or `cookie`, compared ASCII-case-insensitively, in the map's
/// iteration order.
fn extract_cookies(headers: &BTreeMap<String, String>) -> Vec<String> {
    headers
        .iter()
        .filter(|(name, _)| {
            name.eq_ignore_ascii_case("set-cookie") || name.eq_ignore_ascii_case("cookie")
        })
        .map(|(_, value)| value.clone())
        .collect()
}
608
#[cfg(test)]
#[allow(
    clippy::unwrap_used,
    clippy::expect_used,
    clippy::panic,
    clippy::indexing_slicing
)]
mod tests {
    use std::collections::BTreeMap;

    use super::*;
    use crate::vendor_classifier::evidence::EvidenceSource;
    use crate::vendor_classifier::vendor::VendorSignal;

    /// URL used by tests that do not exercise challenge-URL signals.
    const PLAIN_URL: &str = "https://example.com/";

    /// Tier 1 definition carrying exactly one signal.
    fn single_signal_definition(
        id: VendorId,
        pattern: &str,
        source: EvidenceSource,
        weight: u32,
    ) -> VendorDefinition {
        VendorDefinition {
            id,
            display_name: format!("{id:?}"),
            description: String::new(),
            tier: 1,
            signals: vec![VendorSignal {
                pattern: pattern.to_string(),
                source,
                weight,
            }],
        }
    }

    fn datadome_definition() -> VendorDefinition {
        single_signal_definition(VendorId::DataDome, "x-datadome", EvidenceSource::Header, 5)
    }

    fn cloudflare_definition() -> VendorDefinition {
        single_signal_definition(VendorId::Cloudflare, "cf-ray", EvidenceSource::Header, 5)
    }

    fn empty_classifier() -> VendorClassifier {
        VendorClassifier::new(Vec::new())
    }

    /// Build an owned header map from `(name, value)` pairs.
    fn header_map(pairs: &[(&str, &str)]) -> BTreeMap<String, String> {
        pairs
            .iter()
            .map(|&(name, value)| (name.to_string(), value.to_string()))
            .collect()
    }

    /// A 403 transaction with the given response headers and no body.
    fn blocked_transaction(response_headers: BTreeMap<String, String>) -> TransactionView {
        TransactionView {
            url: PLAIN_URL.to_string(),
            status: 403,
            response_headers,
            response_body_snippet: None,
        }
    }

    #[test]
    fn empty_classifier_reports_unknown() {
        let classification = empty_classifier().classify(&[], &BTreeMap::new(), None, PLAIN_URL);
        assert_eq!(classification.top_vendor, VendorId::Unknown);
        assert!(classification.is_unknown());
        assert!(!classification.is_high_confidence);
        assert!(classification.evidence.is_empty());
        assert!(classification.ranked.is_empty());
    }

    #[test]
    fn single_vendor_match_with_one_signal_above_threshold() {
        let classifier = VendorClassifier::new(vec![datadome_definition()]).with_threshold(0.60);
        let headers = header_map(&[("x-datadome", "protected")]);
        let classification = classifier.classify(&[], &headers, None, PLAIN_URL);
        assert_eq!(classification.top_vendor, VendorId::DataDome);
        assert!((classification.confidence - 1.0).abs() < 1e-9);
        assert!(classification.is_high_confidence);
        assert_eq!(classification.evidence.items.len(), 1);
        assert_eq!(
            classification.evidence.items[0].source,
            EvidenceSource::Header
        );
    }

    #[test]
    fn multi_vendor_match_ranks_by_score_with_deterministic_tie_break() {
        // DataDome is deliberately registered *first*.
        let classifier =
            VendorClassifier::new(vec![datadome_definition(), cloudflare_definition()]);
        // Both vendors score 5 from their respective signals.
        let headers = header_map(&[("x-datadome", "1"), ("cf-ray", "1")]);
        let classification = classifier.classify(&[], &headers, None, PLAIN_URL);
        // Tie-break: Akamai < Cloudflare < DataDome < PerimeterX.
        // Registration order plays no part: Cloudflare wins because
        // it sorts before DataDome in the `VendorId` order.
        assert_eq!(classification.top_vendor, VendorId::Cloudflare);
        // Confidence = top / (top + second) = 5 / (5 + 5) = 0.5
        assert!((classification.confidence - 0.5).abs() < 1e-9);
        assert!(!classification.is_high_confidence);
    }

    #[test]
    fn below_threshold_classification_is_not_high_confidence() {
        // A lone registered vendor always reports confidence 1.0, so
        // even a 0.99 threshold is met ...
        let single = VendorClassifier::new(vec![datadome_definition()]).with_threshold(0.99);
        let lone = single.classify(&[], &header_map(&[("x-datadome", "1")]), None, PLAIN_URL);
        assert!((lone.confidence - 1.0).abs() < 1e-9);
        assert!(lone.is_high_confidence);

        // ... and the only way to drop below it is a multi-vendor
        // split.
        let two = VendorClassifier::new(vec![datadome_definition(), cloudflare_definition()])
            .with_threshold(0.99);
        let headers = header_map(&[("x-datadome", "1"), ("cf-ray", "1")]);
        let split = two.classify(&[], &headers, None, PLAIN_URL);
        assert!((split.confidence - 0.5).abs() < 1e-9);
        assert!(!split.is_high_confidence);
    }

    #[test]
    fn cookie_signals_match_directly_supplied_cookies() {
        let classifier = VendorClassifier::new(vec![single_signal_definition(
            VendorId::DataDome,
            "datadome=",
            EvidenceSource::Cookie,
            5,
        )]);
        // `classify` takes the cookies as a slice; pulling them out
        // of the `set-cookie` header is `classify_view`'s job and is
        // covered by the next test.
        let cookies = vec!["datadome=abc; Path=/".to_string()];
        let classification = classifier.classify(&cookies, &BTreeMap::new(), None, PLAIN_URL);
        assert_eq!(classification.top_vendor, VendorId::DataDome);
        assert_eq!(classification.evidence.items.len(), 1);
        assert_eq!(
            classification.evidence.items[0].source,
            EvidenceSource::Cookie
        );
    }

    #[test]
    fn classify_view_extracts_cookies_from_set_cookie_header() {
        let classifier = VendorClassifier::new(vec![single_signal_definition(
            VendorId::DataDome,
            "datadome=",
            EvidenceSource::Cookie,
            5,
        )]);
        let tx = blocked_transaction(header_map(&[("set-cookie", "datadome=abc; Path=/")]));
        let classification = classifier.classify_view(&tx);
        assert_eq!(classification.top_vendor, VendorId::DataDome);
        assert_eq!(
            classification.evidence.items[0].source,
            EvidenceSource::Cookie
        );
    }

    #[test]
    fn body_markers_match_case_insensitively() {
        let classifier = VendorClassifier::new(vec![single_signal_definition(
            VendorId::Cloudflare,
            "attention required! | cloudflare",
            EvidenceSource::BodyMarker,
            4,
        )]);
        let body = "<h1>Attention Required! | Cloudflare</h1>";
        let classification = classifier.classify(&[], &BTreeMap::new(), Some(body), "https://x/");
        assert_eq!(classification.top_vendor, VendorId::Cloudflare);
        assert_eq!(
            classification.evidence.items[0].source,
            EvidenceSource::BodyMarker
        );
    }

    #[test]
    fn challenge_url_signal_matches_path_segments() {
        let classifier = VendorClassifier::new(vec![single_signal_definition(
            VendorId::Cloudflare,
            "cdn-cgi/challenge-platform",
            EvidenceSource::ChallengeUrl,
            4,
        )]);
        let url = "https://example.com/cdn-cgi/challenge-platform/orchestrate/jschl/abc";
        let classification = classifier.classify(&[], &BTreeMap::new(), None, url);
        assert_eq!(classification.top_vendor, VendorId::Cloudflare);
        assert_eq!(
            classification.evidence.items[0].source,
            EvidenceSource::ChallengeUrl
        );
    }

    #[test]
    fn classify_view_pulls_inputs_from_transaction() {
        let classifier = VendorClassifier::new(vec![datadome_definition()]);
        let tx = blocked_transaction(header_map(&[("x-datadome", "1")]));
        let c = classifier.classify_view(&tx);
        assert_eq!(c.top_vendor, VendorId::DataDome);
    }

    #[test]
    fn threshold_validation_falls_back_to_default() {
        for invalid in [f64::NAN, -1.0, 0.0, 1.5] {
            let classifier = VendorClassifier::new(Vec::new()).with_threshold(invalid);
            assert!(
                (classifier.threshold() - DEFAULT_HIGH_CONFIDENCE_THRESHOLD).abs() < 1e-9,
                "threshold {invalid} should fall back to the default"
            );
        }
        // The upper bound itself is inside the accepted range.
        let upper = VendorClassifier::new(Vec::new()).with_threshold(1.0);
        assert!((upper.threshold() - 1.0).abs() < 1e-9);
    }

    #[test]
    fn vendor_id_discriminant_order_breaks_ties() {
        // The order of variants in the `VendorId` enum determines
        // the tie-break: Akamai < Cloudflare < DataDome < PerimeterX.
        let classifier = VendorClassifier::new(vec![
            single_signal_definition(VendorId::Akamai, "tied", EvidenceSource::BodyMarker, 5),
            single_signal_definition(VendorId::PerimeterX, "tied", EvidenceSource::BodyMarker, 5),
        ]);
        let body = "this body contains the tied marker";
        let c = classifier.classify(&[], &BTreeMap::new(), Some(body), "https://x/");
        // Both score 5; the earlier `VendorId` wins.
        assert_eq!(c.top_vendor, VendorId::Akamai);
    }

    #[test]
    fn builtin_classifier_includes_all_tier1_vendors() {
        let classifier = VendorClassifier::with_builtin_defaults();
        for vendor in [
            VendorId::DataDome,
            VendorId::PerimeterX,
            VendorId::Akamai,
            VendorId::Cloudflare,
        ] {
            assert!(classifier.contains(vendor), "missing builtin {vendor:?}");
        }
    }

    #[test]
    fn builtin_classifier_detects_cloudflare_in_realistic_input() {
        let classifier = VendorClassifier::with_builtin_defaults();
        let headers = header_map(&[("cf-ray", "abc-ORD"), ("server", "cloudflare")]);
        let cookies = vec!["__cf_bm=xyz; path=/".to_string()];
        let body = "Attention required! | cloudflare";
        let url = "https://example.com/cdn-cgi/challenge-platform/orchestrate";
        let c = classifier.classify(&cookies, &headers, Some(body), url);
        assert_eq!(c.top_vendor, VendorId::Cloudflare);
        assert!(c.is_high_confidence);
        assert!(c.confidence > 0.0);
        // Per-source summary should record at least one of each source.
        for source in [
            EvidenceSource::Header,
            EvidenceSource::Cookie,
            EvidenceSource::BodyMarker,
            EvidenceSource::ChallengeUrl,
        ] {
            assert!(
                c.evidence.source_summary.contains_key(&source),
                "no evidence recorded for {source:?}"
            );
        }
    }
}
925}