Skip to main content

stygian_charon/vendor_classifier/
vendor.rs

//! Vendor taxonomy and TOML-loadable definitions (T89).
//!
//! The [`VendorId`] enum is the **stable, wire-level identifier**
//! for every anti-bot vendor the classifier knows about. Adding a
//! new variant is a breaking change for downstream consumers
//! (e.g. `VendorClassification` JSON payloads), so the taxonomy is
//! intentionally small and uses `#[serde(rename_all = "snake_case")]`
//! for predictable wire labels.
//!
//! ## Tier 1 (always shipped)
//!
//! The four Tier 1 vendors are documented in
//! `crates/stygian-charon/data/vendors/` and embedded into the
//! binary at compile time via `include_str!`. Their TOML payload
//! is the single source of truth for the per-vendor signal
//! catalogue; the enum below is the wire/lookup contract.
//!
//! | `VendorId`   | Display name                  | TOML file                       |
//! |--------------|-------------------------------|---------------------------------|
//! | `DataDome`   | `DataDome`                    | `data/vendors/datadome.toml`    |
//! | `PerimeterX` | `PerimeterX` / HUMAN Security | `data/vendors/perimeter_x.toml` |
//! | `Akamai`     | `Akamai` Bot Manager          | `data/vendors/akamai.toml`      |
//! | `Cloudflare` | `Cloudflare`                  | `data/vendors/cloudflare.toml`  |
//!
//! ## Tier 2 (taxonomy-only, no baseline signals)
//!
//! `Hcaptcha`, `Recaptcha`, `Kasada`, `FingerprintCom`,
//! `ShapeSecurity`, and `Imperva` are present in the enum so
//! downstream T88/T90 layers can name them, but no baseline
//! signals ship for them — operators must register their own
//! signal catalogue via
//! [`VendorDefinition`][crate::vendor_classifier::VendorDefinition].
//!
//! ## Unknown
//!
//! `Unknown` is the catch-all variant used when no vendor matched
//! or when no classification can be produced. It must remain the
//! **last** variant so that it sorts last under the deterministic
//! tie-break rule (see
//! [`crate::vendor_classifier::VendorClassification`]).
41
42use std::collections::BTreeMap;
43
44use serde::{Deserialize, Serialize};
45
46use crate::vendor_classifier::error::VendorError;
47use crate::vendor_classifier::evidence::EvidenceSource;
48
49/// Stable identifier for an anti-bot vendor.
50///
51/// The discriminant order is **significant**: it is the
52/// deterministic tie-break rule for the classifier. When two
53/// vendors tie on the top score, the lower discriminant
54/// (`Akamai` < `Cloudflare` < `DataDome` < `PerimeterX` < …)
55/// wins.
56///
57/// # Example
58///
59/// ```
60/// use stygian_charon::vendor_classifier::VendorId;
61///
62/// let v = VendorId::DataDome;
63/// assert_eq!(v.label(), "datadome");
64/// assert_eq!(v.tier(), 1);
65/// ```
66#[derive(
67    Debug, Default, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord, Serialize, Deserialize,
68)]
69#[serde(rename_all = "snake_case")]
70pub enum VendorId {
71    /// `Akamai` Bot Manager (`_abck`, `bm_sz`).
72    Akamai,
73    /// `Cloudflare` bot management (`cf-ray`, `__cf_bm`).
74    Cloudflare,
75    /// `DataDome` (`datadome=`, `x-datadome`).
76    DataDome,
77    /// `PerimeterX` / HUMAN Security (`_px3`, `_px2`).
78    PerimeterX,
79    /// hCaptcha challenge provider.
80    Hcaptcha,
81    /// Google reCAPTCHA challenge provider.
82    Recaptcha,
83    /// Kasada challenge provider.
84    Kasada,
85    /// Fingerprint.com identification.
86    FingerprintCom,
87    /// Shape Security (F5).
88    ShapeSecurity,
89    /// Imperva (Incapsula) bot management.
90    Imperva,
91    /// Catch-all when no vendor matched.
92    #[default]
93    Unknown,
94}
95
96impl VendorId {
97    /// Stable, lower-case wire label.
98    ///
99    /// # Example
100    ///
101    /// ```
102    /// use stygian_charon::vendor_classifier::VendorId;
103    ///
104    /// assert_eq!(VendorId::DataDome.label(), "datadome");
105    /// assert_eq!(VendorId::PerimeterX.label(), "perimeter_x");
106    /// assert_eq!(VendorId::Cloudflare.label(), "cloudflare");
107    /// assert_eq!(VendorId::Akamai.label(), "akamai");
108    /// ```
109    #[must_use]
110    pub const fn label(self) -> &'static str {
111        match self {
112            Self::Akamai => "akamai",
113            Self::Cloudflare => "cloudflare",
114            Self::DataDome => "datadome",
115            Self::PerimeterX => "perimeter_x",
116            Self::Hcaptcha => "hcaptcha",
117            Self::Recaptcha => "recaptcha",
118            Self::Kasada => "kasada",
119            Self::FingerprintCom => "fingerprint_com",
120            Self::ShapeSecurity => "shape_security",
121            Self::Imperva => "imperva",
122            Self::Unknown => "unknown",
123        }
124    }
125
126    /// Tier number (1 = always shipped, 2 = taxonomy-only, 0 = unknown).
127    ///
128    /// # Example
129    ///
130    /// ```
131    /// use stygian_charon::vendor_classifier::VendorId;
132    ///
133    /// assert_eq!(VendorId::DataDome.tier(), 1);
134    /// assert_eq!(VendorId::Cloudflare.tier(), 1);
135    /// assert_eq!(VendorId::Akamai.tier(), 1);
136    /// assert_eq!(VendorId::PerimeterX.tier(), 1);
137    /// assert_eq!(VendorId::Unknown.tier(), 0);
138    /// ```
139    #[must_use]
140    pub const fn tier(self) -> u8 {
141        match self {
142            Self::DataDome | Self::PerimeterX | Self::Akamai | Self::Cloudflare => 1,
143            Self::Hcaptcha
144            | Self::Recaptcha
145            | Self::Kasada
146            | Self::FingerprintCom
147            | Self::ShapeSecurity
148            | Self::Imperva => 2,
149            Self::Unknown => 0,
150        }
151    }
152
153    /// Parse a [`VendorId`] from its [`label`][Self::label].
154    ///
155    /// # Example
156    ///
157    /// ```
158    /// use stygian_charon::vendor_classifier::VendorId;
159    ///
160    /// assert_eq!(VendorId::from_label("datadome"), Some(VendorId::DataDome));
161    /// assert_eq!(VendorId::from_label("cloudflare"), Some(VendorId::Cloudflare));
162    /// assert_eq!(VendorId::from_label("nope"), None);
163    /// ```
164    #[must_use]
165    pub fn from_label(label: &str) -> Option<Self> {
166        match label {
167            "akamai" => Some(Self::Akamai),
168            "cloudflare" => Some(Self::Cloudflare),
169            "datadome" => Some(Self::DataDome),
170            "perimeter_x" => Some(Self::PerimeterX),
171            "hcaptcha" => Some(Self::Hcaptcha),
172            "recaptcha" => Some(Self::Recaptcha),
173            "kasada" => Some(Self::Kasada),
174            "fingerprint_com" => Some(Self::FingerprintCom),
175            "shape_security" => Some(Self::ShapeSecurity),
176            "imperva" => Some(Self::Imperva),
177            "unknown" => Some(Self::Unknown),
178            _ => None,
179        }
180    }
181}
182
183/// One signal row from a vendor definition's `[[signals]]` table.
184///
185/// A signal is the smallest unit the classifier matches against the
186/// input strings (cookies, headers, challenge URLs, body markers,
187/// scripts). Patterns are matched **case-insensitively** — the
188/// loader lower-cases them at load time so the per-request
189/// classification hot path never has to.
190///
191/// # Example
192///
193/// ```
194/// use stygian_charon::vendor_classifier::{EvidenceSource, VendorSignal};
195///
196/// let s = VendorSignal {
197///     pattern: "x-datadome".to_string(),
198///     source: EvidenceSource::Header,
199///     weight: 5,
200/// };
201/// assert_eq!(s.weight, 5);
202/// ```
203#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
204pub struct VendorSignal {
205    /// Literal pattern to search for (case-insensitive).
206    pub pattern: String,
207    /// Which input channel the pattern is matched against.
208    pub source: EvidenceSource,
209    /// Weight contributed to the vendor score on a hit.
210    pub weight: u32,
211}
212
213/// One vendor's signal catalogue. Multiple vendors can ship
214/// definitions; the [`crate::vendor_classifier::VendorClassifier`]
215/// consumes them all and ranks the matches.
216///
217/// Definitions are loaded from TOML at compile time via
218/// `include_str!`. The schema is
219/// `serde::Deserialize` so the same TOML files double as the
220/// operator-facing configuration surface.
221///
222/// # Example
223///
224/// ```
225/// use stygian_charon::vendor_classifier::{VendorDefinition, VendorId, VendorSignal, EvidenceSource};
226///
227/// let def = VendorDefinition {
228///     id: VendorId::DataDome,
229///     display_name: "DataDome".to_string(),
230///     description: "baseline".to_string(),
231///     tier: 1,
232///     signals: vec![VendorSignal {
233///         pattern: "x-datadome".to_string(),
234///         source: EvidenceSource::Header,
235///         weight: 5,
236///     }],
237/// };
238/// assert!(def.validate().is_ok());
239/// ```
240#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
241pub struct VendorDefinition {
242    /// Vendor identifier from the [`VendorId`] enum.
243    pub id: VendorId,
244    /// Human-readable display name (used in operator logs).
245    pub display_name: String,
246    /// Short description of the vendor stack.
247    #[serde(default)]
248    pub description: String,
249    /// Tier (1 = always shipped, 2 = taxonomy-only).
250    pub tier: u8,
251    /// Signal catalogue.
252    #[serde(default)]
253    pub signals: Vec<VendorSignal>,
254}
255
256impl VendorDefinition {
257    /// Validate the definition's internal consistency.
258    ///
259    /// # Errors
260    ///
261    /// Returns [`VendorError`] on the first inconsistency. The
262    /// error embeds the field path and the bad value so operators
263    /// can locate the offending TOML line without re-running the
264    /// loader.
265    pub fn validate(&self) -> Result<(), VendorError> {
266        if self.display_name.trim().is_empty() {
267            return Err(VendorError::invalid_field(
268                self.id.label(),
269                "display_name",
270                self.display_name.clone(),
271                "display_name must be a non-empty string",
272            ));
273        }
274        if !(0..=2).contains(&self.tier) {
275            return Err(VendorError::invalid_field(
276                self.id.label(),
277                "tier",
278                self.tier,
279                "tier must be 0 (unknown), 1 (baseline), or 2 (taxonomy-only)",
280            ));
281        }
282        for (i, sig) in self.signals.iter().enumerate() {
283            if sig.pattern.trim().is_empty() {
284                return Err(VendorError::invalid_field(
285                    self.id.label(),
286                    format!("signals[{i}].pattern"),
287                    sig.pattern.clone(),
288                    "pattern must be a non-empty string",
289                ));
290            }
291            if sig.weight == 0 {
292                return Err(VendorError::invalid_field(
293                    self.id.label(),
294                    format!("signals[{i}].weight"),
295                    sig.weight,
296                    "weight must be > 0",
297                ));
298            }
299        }
300        Ok(())
301    }
302
303    /// Return the signals, indexed by [`EvidenceSource`] for fast
304    /// classification.
305    #[must_use]
306    pub fn signals_by_source(&self) -> BTreeMap<EvidenceSource, Vec<&VendorSignal>> {
307        let mut grouped: BTreeMap<EvidenceSource, Vec<&VendorSignal>> = BTreeMap::new();
308        for sig in &self.signals {
309            grouped.entry(sig.source).or_default().push(sig);
310        }
311        grouped
312    }
313}
314
315/// Parse a raw TOML payload into a [`VendorDefinition`].
316///
317/// The TOML is expected to declare the `id` field as the lower-case
318/// `VendorId` label (e.g. `"datadome"`). The loader maps that label
319/// into a [`VendorId`] discriminant and rejects unknown ids with
320/// [`VendorError::UnknownVendorId`].
321///
322/// # Errors
323///
324/// Returns [`VendorError`] when the TOML fails to parse, the
325/// declared id is not part of the supported taxonomy, or the
326/// resulting [`VendorDefinition`] fails [`validate`][VendorDefinition::validate].
327pub fn parse_vendor_definition(toml_text: &str) -> Result<VendorDefinition, VendorError> {
328    #[derive(Deserialize)]
329    struct RawDefinition {
330        id: String,
331        display_name: String,
332        #[serde(default)]
333        description: String,
334        #[serde(default = "default_tier")]
335        tier: u8,
336        #[serde(default)]
337        signals: Vec<VendorSignal>,
338    }
339
340    let raw: RawDefinition = toml::from_str(toml_text)?;
341    let id = VendorId::from_label(&raw.id).ok_or_else(|| VendorError::UnknownVendorId {
342        vendor_id: raw.id.clone(),
343    })?;
344    let def = VendorDefinition {
345        id,
346        display_name: raw.display_name,
347        description: raw.description,
348        tier: raw.tier,
349        signals: raw
350            .signals
351            .into_iter()
352            .map(|mut s| {
353                s.pattern = s.pattern.to_ascii_lowercase();
354                s
355            })
356            .collect(),
357    };
358    def.validate()?;
359    Ok(def)
360}
361
/// Serde fallback for a definition's `tier` when the TOML omits it.
const fn default_tier() -> u8 {
    const FALLBACK_TIER: u8 = 1;
    FALLBACK_TIER
}
365
#[cfg(test)]
#[allow(
    clippy::unwrap_used,
    clippy::expect_used,
    clippy::panic,
    clippy::indexing_slicing
)]
mod tests {
    use super::*;

    /// Build a `DataDome` definition with an empty description.
    fn datadome_definition(
        display_name: &str,
        tier: u8,
        signals: Vec<VendorSignal>,
    ) -> VendorDefinition {
        VendorDefinition {
            id: VendorId::DataDome,
            display_name: display_name.to_string(),
            description: String::new(),
            tier,
            signals,
        }
    }

    /// Shorthand constructor for a signal row.
    fn signal(pattern: &str, source: EvidenceSource, weight: u32) -> VendorSignal {
        VendorSignal {
            pattern: pattern.to_string(),
            source,
            weight,
        }
    }

    #[test]
    fn vendor_id_labels_round_trip() {
        let every_variant = [
            VendorId::Akamai,
            VendorId::Cloudflare,
            VendorId::DataDome,
            VendorId::PerimeterX,
            VendorId::Hcaptcha,
            VendorId::Recaptcha,
            VendorId::Kasada,
            VendorId::FingerprintCom,
            VendorId::ShapeSecurity,
            VendorId::Imperva,
            VendorId::Unknown,
        ];
        for variant in every_variant {
            assert_eq!(VendorId::from_label(variant.label()), Some(variant));
        }
    }

    #[test]
    fn vendor_id_unknown_label_returns_none() {
        // "DataDome" checks that the lookup is case-sensitive.
        for rejected in ["nope", "", "DataDome"] {
            assert_eq!(VendorId::from_label(rejected), None);
        }
    }

    #[test]
    fn vendor_id_tier_matches_taxonomy_table() {
        let expectations = [
            (VendorId::DataDome, 1),
            (VendorId::PerimeterX, 1),
            (VendorId::Akamai, 1),
            (VendorId::Cloudflare, 1),
            (VendorId::Hcaptcha, 2),
            (VendorId::Recaptcha, 2),
            (VendorId::Unknown, 0),
        ];
        for (variant, expected_tier) in expectations {
            assert_eq!(variant.tier(), expected_tier);
        }
    }

    #[test]
    fn definition_rejects_empty_display_name() {
        let err = datadome_definition("", 1, Vec::new())
            .validate()
            .expect_err("empty display_name");
        assert_eq!(err.field_path(), Some("display_name"));
    }

    #[test]
    fn definition_rejects_out_of_range_tier() {
        let err = datadome_definition("x", 9, Vec::new())
            .validate()
            .expect_err("bad tier");
        assert_eq!(err.field_path(), Some("tier"));
    }

    #[test]
    fn definition_rejects_empty_pattern() {
        let blank_pattern = signal("", EvidenceSource::Header, 5);
        let err = datadome_definition("x", 1, vec![blank_pattern])
            .validate()
            .expect_err("empty pattern");
        assert!(err.field_path().is_some_and(|p| p.contains("signals[0]")));
    }

    #[test]
    fn definition_rejects_zero_weight() {
        let weightless = signal("x", EvidenceSource::Header, 0);
        let err = datadome_definition("x", 1, vec![weightless])
            .validate()
            .expect_err("zero weight");
        assert!(err.field_path().is_some_and(|p| p.contains("signals[0]")));
    }

    #[test]
    fn parse_vendor_definition_round_trips_through_toml() {
        let toml_text = r#"
id = "datadome"
display_name = "DataDome"
description = "test"
tier = 1

[[signals]]
pattern = "X-DATADOME"
source = "header"
weight = 5
"#;
        let parsed = parse_vendor_definition(toml_text).expect("parse");
        assert_eq!(parsed.id, VendorId::DataDome);
        assert_eq!(parsed.tier, 1);
        // Patterns are case-folded at load time.
        assert_eq!(
            parsed.signals.first().map(|s| s.pattern.as_str()),
            Some("x-datadome")
        );
    }

    #[test]
    fn parse_vendor_definition_rejects_unknown_id() {
        let toml_text = r#"
id = "nope"
display_name = "Nope"
tier = 1
"#;
        let err = parse_vendor_definition(toml_text).expect_err("unknown id");
        assert!(matches!(err, VendorError::UnknownVendorId { .. }));
    }

    #[test]
    fn signals_by_source_groups_correctly() {
        let def = datadome_definition(
            "x",
            1,
            vec![
                signal("a", EvidenceSource::Header, 1),
                signal("b", EvidenceSource::Header, 2),
                signal("c", EvidenceSource::Cookie, 3),
            ],
        );
        let grouped = def.signals_by_source();
        let count_for = |source: EvidenceSource| grouped.get(&source).map(Vec::len);
        assert_eq!(count_for(EvidenceSource::Header), Some(2));
        assert_eq!(count_for(EvidenceSource::Cookie), Some(1));
        assert_eq!(count_for(EvidenceSource::BodyMarker), None);
    }
}