Skip to main content

stygian_charon/vendor_classifier/
classifier.rs

1//! Vendor classification engine (T89).
2//!
3//! The [`VendorClassifier`] is a deterministic, evidence-emitting
4//! ranker that consumes cookies, headers, challenge URLs, and body
5//! markers and produces a ranked vendor scoreboard. It is the
6//! primary input to T88 (anti-bot change-detection feed) and T90
7//! (vendor-to-playbook auto-resolution).
8//!
9//! ## Confidence formula
10//!
11//! For each [`VendorDefinition`] the classifier sums the weights
12//! of the matched signals. The **top vendor**'s confidence is then
13//!
14//! ```text
15//! confidence = top_score / (top_score + second_score)
16//! ```
17//!
18//! which is the same Jaccard-style ratio the existing
19//! [`crate::classifier::classify_transaction`] uses. When only one
20//! vendor matched, `confidence = 1.0`. When no vendor matched, the
21//! classification is reported as [`VendorId::Unknown`] with
22//! `confidence = 0.0`.
23//!
24//! ## Deterministic tie-break rule
25//!
26//! When two or more vendors tie on the **same top score**, the
27//! tie is broken by [`VendorId`] discriminant order: the variant
28//! declared **earlier** in the enum wins. This means
29//! `Akamai < Cloudflare < DataDome < PerimeterX < …` — the same
//! order the enum source declares. The order is stable across
//! releases and is exactly the one produced by the
//! [`Ord`][std::cmp::Ord] implementation derived on [`VendorId`].
33//!
34//! ## High-confidence threshold
35//!
36//! The classifier carries a configurable threshold
37//! [`DEFAULT_HIGH_CONFIDENCE_THRESHOLD`] (0.60). The
38//! [`VendorClassification::is_high_confidence`] flag is set when
39//! the top vendor's confidence crosses the threshold. Callers can
40//! override the threshold via
41//! [`VendorClassifier::with_threshold`].
42//!
43//! # Example
44//!
45//! ```
46//! use stygian_charon::vendor_classifier::{VendorClassifier, VendorId, EvidenceSource};
47//! use std::collections::BTreeMap;
48//!
49//! let classifier = VendorClassifier::with_builtin_defaults();
50//! let mut headers = BTreeMap::new();
51//! headers.insert("cf-ray".to_string(), "abc-ORD".to_string());
52//! headers.insert("server".to_string(), "cloudflare".to_string());
53//! let cookies = vec!["__cf_bm=xyz; path=/".to_string()];
54//! let body = "Attention required! | cloudflare".to_string();
55//! let url = "https://example.com/cdn-cgi/challenge-platform";
56//!
57//! let classification = classifier.classify(&cookies, &headers, Some(&body), url);
58//! assert_eq!(classification.top_vendor, VendorId::Cloudflare);
59//! assert!(classification.is_high_confidence);
60//! assert!(classification.confidence > 0.0);
61//! ```
62
63use std::collections::BTreeMap;
64
65use serde::{Deserialize, Serialize};
66
67use crate::har;
68use crate::types::TransactionView;
69use crate::vendor_classifier::evidence::{Evidence, EvidenceBundle, EvidenceSource};
70use crate::vendor_classifier::vendor::{VendorDefinition, VendorId};
71
/// Default confidence threshold for the
/// [`VendorClassification::is_high_confidence`] flag.
///
/// The flag is set when the top vendor's confidence is **greater
/// than or equal to** the threshold. Callers can override the
/// threshold via [`VendorClassifier::with_threshold`]. Values
/// outside the `(0.0, 1.0]` range fall back to this default.
pub const DEFAULT_HIGH_CONFIDENCE_THRESHOLD: f64 = 0.60;
79
/// Maximum confidence. Assigned when the scoreboard holds exactly
/// one vendor and that vendor scored above zero, so there is no
/// runner-up to divide by.
const FULL_CONFIDENCE: f64 = 1.0;
82
/// Per-vendor scorecard returned by the classifier.
///
/// A `VendorScore` pairs one vendor with the **sum of the weights**
/// of its matched signals and a per-source tally of the evidence
/// rows attributed to it. The classifier returns scorecards in
/// **rank order**: highest score first, ties broken by ascending
/// [`VendorId`].
///
/// # Example
///
/// ```
/// use stygian_charon::vendor_classifier::{EvidenceSource, VendorId, VendorScore};
///
/// let score = VendorScore {
///     vendor: VendorId::Cloudflare,
///     score: 10,
///     matched_sources: vec![(EvidenceSource::Header, 2), (EvidenceSource::Cookie, 1)]
///         .into_iter()
///         .collect(),
/// };
/// assert_eq!(score.vendor, VendorId::Cloudflare);
/// assert_eq!(score.score, 10);
/// ```
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct VendorScore {
    /// Vendor this score belongs to.
    pub vendor: VendorId,
    /// Sum of the matched signal weights; `0` means nothing matched.
    pub score: u32,
    /// Number of evidence rows attributed to this vendor, keyed by
    /// the source they came from (`BTreeMap` keeps the output
    /// deterministic).
    pub matched_sources: BTreeMap<EvidenceSource, usize>,
}
114
115impl VendorScore {
116    /// `true` when this score reflects a real (non-zero) match.
117    #[must_use]
118    pub const fn is_match(&self) -> bool {
119        self.score > 0
120    }
121}
122
/// Full vendor classification output.
///
/// Carries the **ranked scoreboard**, the **top vendor** (the
/// confidence-bearing winner), the **confidence** in the top
/// vendor, the **evidence bundle** the score was computed from,
/// and the **high-confidence flag** the operator-facing policy
/// layer reads to decide whether to escalate. The threshold the
/// flag was evaluated against is stored alongside it so the
/// decision can be audited later.
///
/// # Example
///
/// ```
/// use stygian_charon::vendor_classifier::{VendorClassification, VendorId, EvidenceBundle};
///
/// let classification = VendorClassification {
///     top_vendor: VendorId::Cloudflare,
///     confidence: 0.85,
///     is_high_confidence: true,
///     ranked: Vec::new(),
///     evidence: EvidenceBundle::default(),
///     threshold: 0.60,
/// };
/// assert_eq!(classification.top_vendor, VendorId::Cloudflare);
/// assert!(classification.is_high_confidence);
/// ```
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct VendorClassification {
    /// Vendor with the highest (deterministically tie-broken) score,
    /// or [`VendorId::Unknown`] when no vendor scored above zero.
    pub top_vendor: VendorId,
    /// Confidence in `top_vendor` in `[0.0, 1.0]`.
    pub confidence: f64,
    /// `true` when `confidence >= threshold` (the "high confidence"
    /// policy-routing flag).
    pub is_high_confidence: bool,
    /// Ranked scoreboard (top first).
    pub ranked: Vec<VendorScore>,
    /// Full evidence bundle the score was computed from.
    pub evidence: EvidenceBundle,
    /// Threshold the `is_high_confidence` flag was evaluated against.
    pub threshold: f64,
}
163
164impl VendorClassification {
165    /// `true` when at least one vendor-specific signal matched.
166    #[must_use]
167    pub fn is_identified(&self) -> bool {
168        self.top_vendor != VendorId::Unknown
169    }
170
171    /// `true` when the classification is a clean "no vendor"
172    /// signal (no evidence at all).
173    #[must_use]
174    pub fn is_unknown(&self) -> bool {
175        self.top_vendor == VendorId::Unknown && self.confidence == 0.0
176    }
177}
178
/// Vendor-classification engine.
///
/// Construct with [`VendorClassifier::with_builtin_defaults`] to
/// load the four baseline Tier 1 vendor definitions shipped in
/// `crates/stygian-charon/data/vendors/`, or
/// [`VendorClassifier::new`] for an empty / custom registry.
///
/// The classifier is **stateless** and `Send + Sync` so it can be
/// shared across threads and requests without locking.
///
/// # Example
///
/// ```
/// use stygian_charon::vendor_classifier::{VendorClassifier, VendorId};
/// use std::collections::BTreeMap;
///
/// let empty = VendorClassifier::new(Vec::new());
/// let cookies: Vec<String> = Vec::new();
/// let headers: BTreeMap<String, String> = BTreeMap::new();
/// let classification = empty.classify(&cookies, &headers, None, "https://example.com/");
/// assert_eq!(classification.top_vendor, VendorId::Unknown);
/// assert!(classification.is_unknown());
/// ```
#[derive(Debug, Clone)]
pub struct VendorClassifier {
    // Registered vendor definitions; each one is scored on every
    // `classify` call.
    definitions: Vec<VendorDefinition>,
    // Confidence at or above which a classification is flagged as
    // high-confidence.
    threshold: f64,
}
207
208impl VendorClassifier {
209    /// Build a classifier from a pre-loaded list of
210    /// [`VendorDefinition`] entries.
211    ///
212    /// The threshold defaults to
213    /// [`DEFAULT_HIGH_CONFIDENCE_THRESHOLD`]. Override with
214    /// [`with_threshold`][Self::with_threshold].
215    #[must_use]
216    pub const fn new(definitions: Vec<VendorDefinition>) -> Self {
217        Self {
218            definitions,
219            threshold: DEFAULT_HIGH_CONFIDENCE_THRESHOLD,
220        }
221    }
222
223    /// Build a classifier seeded with the four baseline Tier 1
224    /// vendor definitions embedded at compile time from
225    /// `crates/stygian-charon/data/vendors/`.
226    ///
227    /// The compile-time check
228    /// `compile_check_builtin_vendors`
229    /// guarantees that every embedded TOML is valid; if it
230    /// regresses, the build will fail.
231    ///
232    /// # Example
233    ///
234    /// ```
235    /// use stygian_charon::vendor_classifier::{VendorClassifier, VendorId};
236    ///
237    /// let classifier = VendorClassifier::with_builtin_defaults();
238    /// assert!(classifier.contains(VendorId::DataDome));
239    /// assert!(classifier.contains(VendorId::PerimeterX));
240    /// assert!(classifier.contains(VendorId::Akamai));
241    /// assert!(classifier.contains(VendorId::Cloudflare));
242    /// ```
243    #[must_use]
244    pub fn with_builtin_defaults() -> Self {
245        let definitions = crate::vendor_classifier::builtins::builtin_vendors();
246        Self::new(definitions)
247    }
248
249    /// Override the high-confidence threshold. The supplied value
250    /// is clamped to `(0.0, 1.0]`. Non-finite values (`NaN`,
251    /// `±∞`) fall back to
252    /// [`DEFAULT_HIGH_CONFIDENCE_THRESHOLD`].
253    ///
254    /// # Example
255    ///
256    /// ```
257    /// use stygian_charon::vendor_classifier::{VendorClassifier, DEFAULT_HIGH_CONFIDENCE_THRESHOLD};
258    ///
259    /// let classifier = VendorClassifier::new(Vec::new()).with_threshold(0.85);
260    /// assert!((classifier.threshold() - 0.85).abs() < 1e-9);
261    ///
262    /// // Out-of-range values clamp to the default.
263    /// let reset = VendorClassifier::new(Vec::new()).with_threshold(f64::NAN);
264    /// assert!((reset.threshold() - DEFAULT_HIGH_CONFIDENCE_THRESHOLD).abs() < 1e-9);
265    /// ```
266    #[must_use]
267    pub fn with_threshold(mut self, threshold: f64) -> Self {
268        self.threshold = if threshold.is_finite() && threshold > 0.0 && threshold <= 1.0 {
269            threshold
270        } else {
271            DEFAULT_HIGH_CONFIDENCE_THRESHOLD
272        };
273        self
274    }
275
    /// Configured high-confidence threshold: the value
    /// classifications are compared against when setting
    /// `is_high_confidence`.
    #[must_use]
    pub const fn threshold(&self) -> f64 {
        self.threshold
    }
281
282    /// `true` when the registry contains a definition for the
283    /// given [`VendorId`].
284    #[must_use]
285    pub fn contains(&self, vendor: VendorId) -> bool {
286        self.definitions.iter().any(|d| d.id == vendor)
287    }
288
    /// Number of vendor definitions currently registered. Entries
    /// are counted as stored, so two definitions sharing an id
    /// count twice.
    #[must_use]
    pub const fn len(&self) -> usize {
        self.definitions.len()
    }
294
    /// `true` when the registry has no definitions, in which case
    /// every classification reports [`VendorId::Unknown`].
    #[must_use]
    pub const fn is_empty(&self) -> bool {
        self.definitions.is_empty()
    }
300
301    /// Classify a single set of input strings (cookies, headers,
302    /// optional body, request URL) into a ranked vendor
303    /// classification.
304    ///
305    /// The classifier scans every registered
306    /// [`VendorDefinition`]'s signal catalogue and computes a
307    /// per-vendor weighted score. The match is case-insensitive
308    /// (definitions are lower-cased at load time, and the input
309    /// strings are lower-cased at the match site).
310    /// strings are lower-cased at the match site).
311    ///
312    /// # Determinism
313    ///
314    /// - Signals are matched in `(source, pattern)` lex order.
315    /// - Ties on the top score are broken by
316    ///   [`VendorId`] discriminant order (see module docs).
317    /// - The output is `Send + Sync` and contains no
318    ///   `HashMap`/`HashSet` so the JSON form is byte-stable.
319    #[must_use]
320    pub fn classify(
321        &self,
322        cookies: &[String],
323        headers: &BTreeMap<String, String>,
324        body: Option<&str>,
325        url: &str,
326    ) -> VendorClassification {
327        let mut evidence_items: Vec<Evidence> = Vec::new();
328        let mut scores: BTreeMap<VendorId, VendorScore> = BTreeMap::new();
329
330        for def in &self.definitions {
331            let score = score_definition(def, cookies, headers, body, url, &mut evidence_items);
332            scores.insert(
333                def.id,
334                VendorScore {
335                    vendor: def.id,
336                    score,
337                    matched_sources: BTreeMap::new(),
338                },
339            );
340        }
341
342        // Precompute the per-source count summaries.
343        let mut ranked: Vec<VendorScore> = scores.into_values().collect();
344        for score in &mut ranked {
345            let mut per_source: BTreeMap<EvidenceSource, usize> = BTreeMap::new();
346            for ev in evidence_items.iter().filter(|e| {
347                self.definitions
348                    .iter()
349                    .find(|d| d.id == score.vendor)
350                    .is_some_and(|d| {
351                        // Compound (pattern, source) key match: the
352                        // vendor's pattern `s.pattern` is compared
353                        // against the matched literal `e.signal` plus
354                        // the channel `e.source` — the same-name
355                        // comparison is intentional, not a typo.
356                        #[allow(clippy::suspicious_operation_groupings)]
357                        d.signals
358                            .iter()
359                            .any(|s| s.pattern == e.signal && s.source == e.source)
360                    })
361            }) {
362                *per_source.entry(ev.source).or_insert(0) += 1;
363            }
364            score.matched_sources = per_source;
365        }
366
367        // Rank: descending score, then ascending VendorId (the
368        // deterministic tie-break rule).
369        ranked.sort_by(|a, b| b.score.cmp(&a.score).then_with(|| a.vendor.cmp(&b.vendor)));
370
371        let (top, second) = match ranked.as_slice() {
372            [] => (None, None),
373            [single] => (Some(single), None),
374            [first, rest @ ..] => (Some(first), rest.first()),
375        };
376
377        let (top_vendor, confidence) = match (top, second) {
378            (Some(primary), Some(secondary)) if primary.score > 0 => {
379                let denom = u64::from(primary.score) + u64::from(secondary.score);
380                let conf = if denom == 0 {
381                    0.0
382                } else {
383                    // u32 scores are well within the f64 mantissa
384                    // (max ~4.3B), so the precision loss is
385                    // bounded and intentional.
386                    #[allow(clippy::cast_precision_loss)]
387                    let result = f64::from(primary.score) / (denom as f64);
388                    result
389                };
390                (primary.vendor, conf)
391            }
392            (Some(primary), _) if primary.score > 0 => (primary.vendor, FULL_CONFIDENCE),
393            _ => (VendorId::Unknown, 0.0),
394        };
395
396        let is_high_confidence = confidence >= self.threshold;
397
398        let mut source_summary: BTreeMap<EvidenceSource, usize> = BTreeMap::new();
399        for ev in &evidence_items {
400            *source_summary.entry(ev.source).or_insert(0) += 1;
401        }
402        let evidence = EvidenceBundle {
403            items: evidence_items,
404            source_summary,
405        };
406
407        VendorClassification {
408            top_vendor,
409            confidence,
410            is_high_confidence,
411            ranked,
412            evidence,
413            threshold: self.threshold,
414        }
415    }
416
417    /// Convenience wrapper around
418    /// [`classify`][Self::classify] that pulls the inputs out of a
419    /// [`TransactionView`].
420    ///
421    /// Cookies are extracted from the `set-cookie` / `cookie`
422    /// response header (everything else is treated as a generic
423    /// header). The body is the `response_body_snippet`. The URL
424    /// is `tx.url`.
425    #[must_use]
426    pub fn classify_view(&self, tx: &TransactionView) -> VendorClassification {
427        let cookies = extract_cookies(&tx.response_headers);
428        self.classify(
429            &cookies,
430            &tx.response_headers,
431            tx.response_body_snippet.as_deref(),
432            &tx.url,
433        )
434    }
435
436    /// Classify every transaction in a HAR payload and return the
437    /// top vendor's classification. Cookies, headers, and body
438    /// snippets are pulled from each HAR entry directly.
439    ///
440    /// # Errors
441    ///
442    /// Returns [`har::HarError`] when the HAR JSON is invalid or
443    /// exceeds a configured safety limit.
444    pub fn classify_har(&self, har_json: &str) -> Result<VendorClassification, har::HarError> {
445        let parsed = har::parse_har_transactions(har_json)?;
446        // Each transaction is classified independently; the
447        // **final** classification is the one with the highest
448        // confidence. This keeps the output focused on the
449        // strongest single piece of evidence (typically the
450        // challenge response, which is a single transaction in a
451        // capture).
452        let mut best: Option<VendorClassification> = None;
453        for entry in parsed.requests {
454            let view: TransactionView = entry.into();
455            let classification = self.classify_view(&view);
456            // Higher confidence wins; ties broken by the
457            // deterministic `VendorId` order (lower discriminant
458            // wins). The float comparison is intentional — the
459            // confidence is derived deterministically from the
460            // weighted scoreboard, so equality is meaningful.
461            #[allow(clippy::float_cmp)]
462            let is_better = match &best {
463                None => true,
464                Some(prev) => {
465                    classification.confidence > prev.confidence
466                        || (classification.confidence == prev.confidence
467                            && classification.top_vendor < prev.top_vendor)
468                }
469            };
470            if is_better {
471                best = Some(classification);
472            }
473        }
474        Ok(best.unwrap_or_else(|| VendorClassification {
475            top_vendor: VendorId::Unknown,
476            confidence: 0.0,
477            is_high_confidence: false,
478            ranked: Vec::new(),
479            evidence: EvidenceBundle::default(),
480            threshold: self.threshold,
481        }))
482    }
483}
484
485fn score_definition(
486    def: &VendorDefinition,
487    cookies: &[String],
488    headers: &BTreeMap<String, String>,
489    body: Option<&str>,
490    url: &str,
491    evidence: &mut Vec<Evidence>,
492) -> u32 {
493    let mut total: u32 = 0;
494    let body_lower = body.map(str::to_ascii_lowercase);
495    let url_lower = url.to_ascii_lowercase();
496    let grouped = def.signals_by_source();
497
498    for (source, signals) in &grouped {
499        match source {
500            EvidenceSource::Cookie => {
501                for cookie in cookies {
502                    let lower = cookie.to_ascii_lowercase();
503                    for sig in signals {
504                        if lower.contains(&sig.pattern) {
505                            total = total.saturating_add(sig.weight);
506                            evidence.push(Evidence {
507                                signal: sig.pattern.clone(),
508                                source: EvidenceSource::Cookie,
509                                weight: sig.weight,
510                            });
511                        }
512                    }
513                }
514            }
515            EvidenceSource::Header => {
516                for (name, value) in headers {
517                    // Skip the `set-cookie` / `cookie` headers —
518                    // they are scored as cookies, not generic
519                    // headers, to avoid double-counting the same
520                    // signal in two sources.
521                    let lower_name = name.to_ascii_lowercase();
522                    if lower_name == "set-cookie" || lower_name == "cookie" {
523                        continue;
524                    }
525                    let haystack = format!("{lower_name}:{}", value.to_ascii_lowercase());
526                    for sig in signals {
527                        if haystack.contains(&sig.pattern) {
528                            total = total.saturating_add(sig.weight);
529                            evidence.push(Evidence {
530                                signal: sig.pattern.clone(),
531                                source: EvidenceSource::Header,
532                                weight: sig.weight,
533                            });
534                        }
535                    }
536                }
537            }
538            EvidenceSource::ChallengeUrl => {
539                for sig in signals {
540                    if url_lower.contains(&sig.pattern) {
541                        total = total.saturating_add(sig.weight);
542                        evidence.push(Evidence {
543                            signal: sig.pattern.clone(),
544                            source: EvidenceSource::ChallengeUrl,
545                            weight: sig.weight,
546                        });
547                    }
548                }
549            }
550            EvidenceSource::BodyMarker => {
551                if let Some(body) = &body_lower {
552                    for sig in signals {
553                        if body.contains(&sig.pattern) {
554                            total = total.saturating_add(sig.weight);
555                            evidence.push(Evidence {
556                                signal: sig.pattern.clone(),
557                                source: EvidenceSource::BodyMarker,
558                                weight: sig.weight,
559                            });
560                        }
561                    }
562                }
563            }
564            EvidenceSource::Script => {
565                // The classifier does not currently surface a
566                // separate script snippet, so the `script` source
567                // folds into the body marker matching. This keeps
568                // the public API stable: a future `script` field
569                // on the classifier input can be added without
570                // changing the wire format.
571                if let Some(body) = &body_lower {
572                    for sig in signals {
573                        if body.contains(&sig.pattern) {
574                            total = total.saturating_add(sig.weight);
575                            evidence.push(Evidence {
576                                signal: sig.pattern.clone(),
577                                source: EvidenceSource::Script,
578                                weight: sig.weight,
579                            });
580                        }
581                    }
582                }
583            }
584        }
585    }
586
587    // De-duplicate evidence rows that came from the same
588    // pattern + source pair (e.g. the same cookie value
589    // appearing in multiple header rows). Keeping one row per
590    // (source, pattern) preserves the audit trail without
591    // double-counting.
592    evidence.sort_by(|a, b| (a.source, &a.signal).cmp(&(b.source, &b.signal)));
593    evidence.dedup_by(|a, b| a.source == b.source && a.signal == b.signal);
594
595    total
596}
597
/// Collect the values of every `set-cookie` / `cookie` header.
///
/// Header names are compared ASCII-case-insensitively. Values are
/// returned unmodified, in the map's key order. Returns an empty
/// vector when no cookie header is present.
fn extract_cookies(headers: &BTreeMap<String, String>) -> Vec<String> {
    headers
        .iter()
        // `eq_ignore_ascii_case` compares in place, with no
        // lower-cased copy of each header name.
        .filter(|(name, _)| {
            name.eq_ignore_ascii_case("set-cookie") || name.eq_ignore_ascii_case("cookie")
        })
        .map(|(_, value)| value.clone())
        .collect()
}
608
609#[cfg(test)]
610#[allow(
611    clippy::unwrap_used,
612    clippy::expect_used,
613    clippy::panic,
614    clippy::indexing_slicing
615)]
616mod tests {
617    use std::collections::BTreeMap;
618
619    use super::*;
620    use crate::vendor_classifier::evidence::EvidenceSource;
621    use crate::vendor_classifier::vendor::VendorSignal;
622
623    fn datadome_definition() -> VendorDefinition {
624        VendorDefinition {
625            id: VendorId::DataDome,
626            display_name: "DataDome".to_string(),
627            description: String::new(),
628            tier: 1,
629            signals: vec![VendorSignal {
630                pattern: "x-datadome".to_string(),
631                source: EvidenceSource::Header,
632                weight: 5,
633            }],
634        }
635    }
636
637    fn cloudflare_definition() -> VendorDefinition {
638        VendorDefinition {
639            id: VendorId::Cloudflare,
640            display_name: "Cloudflare".to_string(),
641            description: String::new(),
642            tier: 1,
643            signals: vec![VendorSignal {
644                pattern: "cf-ray".to_string(),
645                source: EvidenceSource::Header,
646                weight: 5,
647            }],
648        }
649    }
650
651    fn empty_classifier() -> VendorClassifier {
652        VendorClassifier::new(Vec::new())
653    }
654
655    #[test]
656    fn empty_classifier_reports_unknown() {
657        let classification =
658            empty_classifier().classify(&[], &BTreeMap::new(), None, "https://example.com/");
659        assert_eq!(classification.top_vendor, VendorId::Unknown);
660        assert!(classification.is_unknown());
661        assert!(!classification.is_high_confidence);
662        assert!(classification.evidence.is_empty());
663        assert!(classification.ranked.is_empty());
664    }
665
666    #[test]
667    fn single_vendor_match_with_one_signal_above_threshold() {
668        let classifier = VendorClassifier::new(vec![datadome_definition()]).with_threshold(0.60);
669        let mut headers = BTreeMap::new();
670        headers.insert("x-datadome".to_string(), "protected".to_string());
671        let classification = classifier.classify(&[], &headers, None, "https://example.com/");
672        assert_eq!(classification.top_vendor, VendorId::DataDome);
673        assert!((classification.confidence - 1.0).abs() < 1e-9);
674        assert!(classification.is_high_confidence);
675        assert_eq!(classification.evidence.items.len(), 1);
676        assert_eq!(
677            classification.evidence.items[0].source,
678            EvidenceSource::Header
679        );
680    }
681
682    #[test]
683    fn multi_vendor_match_ranks_by_score_with_deterministic_tie_break() {
684        let classifier =
685            VendorClassifier::new(vec![datadome_definition(), cloudflare_definition()]);
686        let mut headers = BTreeMap::new();
687        // Both vendors score 5 from their respective signals.
688        headers.insert("x-datadome".to_string(), "1".to_string());
689        headers.insert("cf-ray".to_string(), "1".to_string());
690        let classification = classifier.classify(&[], &headers, None, "https://example.com/");
691        // Tie-break: Akamai (0) < Cloudflare (1) < DataDome (2) < PerimeterX (3).
692        // We have Cloudflare (1) and DataDome (2) tied at 5; DataDome is
693        // declared later in the registry *and* has a higher discriminant,
694        // so Cloudflare wins on the VendorId order tie-break.
695        assert_eq!(classification.top_vendor, VendorId::Cloudflare);
696        // Confidence = top / (top + second) = 5 / (5 + 5) = 0.5
697        assert!((classification.confidence - 0.5).abs() < 1e-9);
698        assert!(!classification.is_high_confidence);
699    }
700
701    #[test]
702    fn below_threshold_classification_is_not_high_confidence() {
703        let classifier = VendorClassifier::new(vec![datadome_definition()]).with_threshold(0.99);
704        let mut headers = BTreeMap::new();
705        headers.insert("x-datadome".to_string(), "1".to_string());
706        let classification = classifier.classify(&[], &headers, None, "https://example.com/");
707        // Single-vendor match still has confidence 1.0, so the
708        // only way to push it below threshold is via a multi-
709        // vendor split.
710        let two = VendorClassifier::new(vec![datadome_definition(), cloudflare_definition()])
711            .with_threshold(0.99);
712        let mut headers2 = BTreeMap::new();
713        headers2.insert("x-datadome".to_string(), "1".to_string());
714        headers2.insert("cf-ray".to_string(), "1".to_string());
715        let c2 = two.classify(&[], &headers2, None, "https://example.com/");
716        assert!(!c2.is_high_confidence);
717        // Sanity-check the value.
718        let _ = classification;
719    }
720
721    #[test]
722    fn cookies_are_extracted_from_set_cookie_header() {
723        let classifier = VendorClassifier::new(vec![VendorDefinition {
724            id: VendorId::DataDome,
725            display_name: "x".to_string(),
726            description: String::new(),
727            tier: 1,
728            signals: vec![VendorSignal {
729                pattern: "datadome=".to_string(),
730                source: EvidenceSource::Cookie,
731                weight: 5,
732            }],
733        }]);
734        // The classifier accepts a `cookies: &[String]` parameter
735        // directly; `classify_view` is the convenience wrapper
736        // that pulls cookies out of the `set-cookie` header.
737        let cookies = vec!["datadome=abc; Path=/".to_string()];
738        let classification =
739            classifier.classify(&cookies, &BTreeMap::new(), None, "https://example.com/");
740        assert_eq!(classification.top_vendor, VendorId::DataDome);
741        assert_eq!(classification.evidence.items.len(), 1);
742        assert_eq!(
743            classification.evidence.items[0].source,
744            EvidenceSource::Cookie
745        );
746    }
747
748    #[test]
749    fn classify_view_extracts_cookies_from_set_cookie_header() {
750        let classifier = VendorClassifier::new(vec![VendorDefinition {
751            id: VendorId::DataDome,
752            display_name: "x".to_string(),
753            description: String::new(),
754            tier: 1,
755            signals: vec![VendorSignal {
756                pattern: "datadome=".to_string(),
757                source: EvidenceSource::Cookie,
758                weight: 5,
759            }],
760        }]);
761        let mut headers = BTreeMap::new();
762        headers.insert("set-cookie".to_string(), "datadome=abc; Path=/".to_string());
763        let tx = TransactionView {
764            url: "https://example.com/".to_string(),
765            status: 403,
766            response_headers: headers,
767            response_body_snippet: None,
768        };
769        let classification = classifier.classify_view(&tx);
770        assert_eq!(classification.top_vendor, VendorId::DataDome);
771        assert_eq!(
772            classification.evidence.items[0].source,
773            EvidenceSource::Cookie
774        );
775    }
776
777    #[test]
778    fn body_markers_match_case_insensitively() {
779        let classifier = VendorClassifier::new(vec![VendorDefinition {
780            id: VendorId::Cloudflare,
781            display_name: "x".to_string(),
782            description: String::new(),
783            tier: 1,
784            signals: vec![VendorSignal {
785                pattern: "attention required! | cloudflare".to_string(),
786                source: EvidenceSource::BodyMarker,
787                weight: 4,
788            }],
789        }]);
790        let body = "<h1>Attention Required! | Cloudflare</h1>";
791        let classification = classifier.classify(&[], &BTreeMap::new(), Some(body), "https://x/");
792        assert_eq!(classification.top_vendor, VendorId::Cloudflare);
793        assert_eq!(
794            classification.evidence.items[0].source,
795            EvidenceSource::BodyMarker
796        );
797    }
798
799    #[test]
800    fn challenge_url_signal_matches_path_segments() {
801        let classifier = VendorClassifier::new(vec![VendorDefinition {
802            id: VendorId::Cloudflare,
803            display_name: "x".to_string(),
804            description: String::new(),
805            tier: 1,
806            signals: vec![VendorSignal {
807                pattern: "cdn-cgi/challenge-platform".to_string(),
808                source: EvidenceSource::ChallengeUrl,
809                weight: 4,
810            }],
811        }]);
812        let url = "https://example.com/cdn-cgi/challenge-platform/orchestrate/jschl/abc";
813        let classification = classifier.classify(&[], &BTreeMap::new(), None, url);
814        assert_eq!(classification.top_vendor, VendorId::Cloudflare);
815        assert_eq!(
816            classification.evidence.items[0].source,
817            EvidenceSource::ChallengeUrl
818        );
819    }
820
821    #[test]
822    fn classify_view_pulls_inputs_from_transaction() {
823        let classifier = VendorClassifier::new(vec![datadome_definition()]);
824        let mut headers = BTreeMap::new();
825        headers.insert("x-datadome".to_string(), "1".to_string());
826        let tx = TransactionView {
827            url: "https://example.com/".to_string(),
828            status: 403,
829            response_headers: headers,
830            response_body_snippet: None,
831        };
832        let c = classifier.classify_view(&tx);
833        assert_eq!(c.top_vendor, VendorId::DataDome);
834    }
835
836    #[test]
837    fn threshold_validation_falls_back_to_default() {
838        let classifier = VendorClassifier::new(Vec::new()).with_threshold(f64::NAN);
839        assert!((classifier.threshold() - DEFAULT_HIGH_CONFIDENCE_THRESHOLD).abs() < 1e-9);
840        let negative = VendorClassifier::new(Vec::new()).with_threshold(-1.0);
841        assert!((negative.threshold() - DEFAULT_HIGH_CONFIDENCE_THRESHOLD).abs() < 1e-9);
842        let above = VendorClassifier::new(Vec::new()).with_threshold(1.5);
843        assert!((above.threshold() - DEFAULT_HIGH_CONFIDENCE_THRESHOLD).abs() < 1e-9);
844    }
845
846    #[test]
847    fn vendor_id_discriminant_order_breaks_ties() {
848        // The order of variants in the `VendorId` enum
849        // determines tie-break: Akamai (0) < Cloudflare (1) <
850        // DataDome (2) < PerimeterX (3).
851        let classifier = VendorClassifier::new(vec![
852            VendorDefinition {
853                id: VendorId::Akamai,
854                display_name: "x".to_string(),
855                description: String::new(),
856                tier: 1,
857                signals: vec![VendorSignal {
858                    pattern: "tied".to_string(),
859                    source: EvidenceSource::BodyMarker,
860                    weight: 5,
861                }],
862            },
863            VendorDefinition {
864                id: VendorId::PerimeterX,
865                display_name: "x".to_string(),
866                description: String::new(),
867                tier: 1,
868                signals: vec![VendorSignal {
869                    pattern: "tied".to_string(),
870                    source: EvidenceSource::BodyMarker,
871                    weight: 5,
872                }],
873            },
874        ]);
875        let body = "this body contains the tied marker";
876        let c = classifier.classify(&[], &BTreeMap::new(), Some(body), "https://x/");
877        // Both score 5; lower VendorId discriminant wins.
878        assert_eq!(c.top_vendor, VendorId::Akamai);
879    }
880
881    #[test]
882    fn builtin_classifier_includes_all_tier1_vendors() {
883        let classifier = VendorClassifier::with_builtin_defaults();
884        assert!(classifier.contains(VendorId::DataDome));
885        assert!(classifier.contains(VendorId::PerimeterX));
886        assert!(classifier.contains(VendorId::Akamai));
887        assert!(classifier.contains(VendorId::Cloudflare));
888    }
889
890    #[test]
891    fn builtin_classifier_detects_cloudflare_in_realistic_input() {
892        let classifier = VendorClassifier::with_builtin_defaults();
893        let mut headers = BTreeMap::new();
894        headers.insert("cf-ray".to_string(), "abc-ORD".to_string());
895        headers.insert("server".to_string(), "cloudflare".to_string());
896        let cookies = vec!["__cf_bm=xyz; path=/".to_string()];
897        let body = "Attention required! | cloudflare";
898        let url = "https://example.com/cdn-cgi/challenge-platform/orchestrate";
899        let c = classifier.classify(&cookies, &headers, Some(body), url);
900        assert_eq!(c.top_vendor, VendorId::Cloudflare);
901        assert!(c.is_high_confidence);
902        assert!(c.confidence > 0.0);
903        // Per-source summary should record at least one of each source.
904        assert!(
905            c.evidence
906                .source_summary
907                .contains_key(&EvidenceSource::Header)
908        );
909        assert!(
910            c.evidence
911                .source_summary
912                .contains_key(&EvidenceSource::Cookie)
913        );
914        assert!(
915            c.evidence
916                .source_summary
917                .contains_key(&EvidenceSource::BodyMarker)
918        );
919        assert!(
920            c.evidence
921                .source_summary
922                .contains_key(&EvidenceSource::ChallengeUrl)
923        );
924    }
925}