stygian_charon/vendor_classifier/vendor.rs
1//! Vendor taxonomy and TOML-loadable definitions (T89).
2//!
3//! The [`VendorId`] enum is the **stable, wire-level identifier**
4//! for every anti-bot vendor the classifier knows about. Adding a
5//! new variant is a breaking change for downstream consumers
6//! (e.g. `VendorClassification` JSON payloads), so the taxonomy is
7//! intentionally small and uses `#[serde(rename_all = "snake_case")]`
8//! for predictable wire labels.
9//!
10//! ## Tier 1 (always shipped)
11//!
12//! The four Tier 1 vendors are documented in
13//! `crates/stygian-charon/data/vendors/` and embedded into the
14//! binary at compile time via `include_str!`. Their TOML payload
15//! is the single source of truth for the per-vendor signal
16//! catalogue; the enum below is the wire/lookup contract.
17//!
18//! | `VendorId` | Display name | TOML file |
19//! |----------------|-----------------------------|----------------------------------|
20//! | `DataDome` | `DataDome` | `data/vendors/datadome.toml` |
21//! | `PerimeterX` | `PerimeterX` / HUMAN Security | `data/vendors/perimeter_x.toml` |
22//! | `Akamai` | `Akamai` Bot Manager | `data/vendors/akamai.toml` |
23//! | `Cloudflare` | `Cloudflare` | `data/vendors/cloudflare.toml` |
24//!
25//! ## Tier 2 (taxonomy-only, no baseline signals)
26//!
27//! `Hcaptcha`, `Recaptcha`, `Kasada`, `FingerprintCom`,
28//! `ShapeSecurity`, and `Imperva` are present in the enum so
29//! downstream T88/T90 layers can name them, but no baseline
30//! signals ship for them — operators must register their own
31//! signal catalogue via
32//! [`VendorDefinition`][crate::vendor_classifier::VendorDefinition].
33//!
34//! ## Unknown
35//!
36//! `Unknown` is the catch-all variant used when no vendor matched
37//! or when no classification can be produced. It must remain the
38//! **last** variant so it sorts last in the
39//! deterministic tie-break rule (see
40//! [`crate::vendor_classifier::VendorClassification`]).
41
42use std::collections::BTreeMap;
43
44use serde::{Deserialize, Serialize};
45
46use crate::vendor_classifier::error::VendorError;
47use crate::vendor_classifier::evidence::EvidenceSource;
48
49/// Stable identifier for an anti-bot vendor.
50///
51/// The discriminant order is **significant**: it is the
52/// deterministic tie-break rule for the classifier. When two
53/// vendors tie on the top score, the lower discriminant
54/// (`Akamai` < `Cloudflare` < `DataDome` < `PerimeterX` < …)
55/// wins.
56///
57/// # Example
58///
59/// ```
60/// use stygian_charon::vendor_classifier::VendorId;
61///
62/// let v = VendorId::DataDome;
63/// assert_eq!(v.label(), "datadome");
64/// assert_eq!(v.tier(), 1);
65/// ```
66#[derive(
67 Debug, Default, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord, Serialize, Deserialize,
68)]
69#[serde(rename_all = "snake_case")]
70pub enum VendorId {
71 /// `Akamai` Bot Manager (`_abck`, `bm_sz`).
72 Akamai,
73 /// `Cloudflare` bot management (`cf-ray`, `__cf_bm`).
74 Cloudflare,
75 /// `DataDome` (`datadome=`, `x-datadome`).
76 DataDome,
77 /// `PerimeterX` / HUMAN Security (`_px3`, `_px2`).
78 PerimeterX,
79 /// hCaptcha challenge provider.
80 Hcaptcha,
81 /// Google reCAPTCHA challenge provider.
82 Recaptcha,
83 /// Kasada challenge provider.
84 Kasada,
85 /// Fingerprint.com identification.
86 FingerprintCom,
87 /// Shape Security (F5).
88 ShapeSecurity,
89 /// Imperva (Incapsula) bot management.
90 Imperva,
91 /// Catch-all when no vendor matched.
92 #[default]
93 Unknown,
94}
95
96impl VendorId {
97 /// Stable, lower-case wire label.
98 ///
99 /// # Example
100 ///
101 /// ```
102 /// use stygian_charon::vendor_classifier::VendorId;
103 ///
104 /// assert_eq!(VendorId::DataDome.label(), "datadome");
105 /// assert_eq!(VendorId::PerimeterX.label(), "perimeter_x");
106 /// assert_eq!(VendorId::Cloudflare.label(), "cloudflare");
107 /// assert_eq!(VendorId::Akamai.label(), "akamai");
108 /// ```
109 #[must_use]
110 pub const fn label(self) -> &'static str {
111 match self {
112 Self::Akamai => "akamai",
113 Self::Cloudflare => "cloudflare",
114 Self::DataDome => "datadome",
115 Self::PerimeterX => "perimeter_x",
116 Self::Hcaptcha => "hcaptcha",
117 Self::Recaptcha => "recaptcha",
118 Self::Kasada => "kasada",
119 Self::FingerprintCom => "fingerprint_com",
120 Self::ShapeSecurity => "shape_security",
121 Self::Imperva => "imperva",
122 Self::Unknown => "unknown",
123 }
124 }
125
126 /// Tier number (1 = always shipped, 2 = taxonomy-only, 0 = unknown).
127 ///
128 /// # Example
129 ///
130 /// ```
131 /// use stygian_charon::vendor_classifier::VendorId;
132 ///
133 /// assert_eq!(VendorId::DataDome.tier(), 1);
134 /// assert_eq!(VendorId::Cloudflare.tier(), 1);
135 /// assert_eq!(VendorId::Akamai.tier(), 1);
136 /// assert_eq!(VendorId::PerimeterX.tier(), 1);
137 /// assert_eq!(VendorId::Unknown.tier(), 0);
138 /// ```
139 #[must_use]
140 pub const fn tier(self) -> u8 {
141 match self {
142 Self::DataDome | Self::PerimeterX | Self::Akamai | Self::Cloudflare => 1,
143 Self::Hcaptcha
144 | Self::Recaptcha
145 | Self::Kasada
146 | Self::FingerprintCom
147 | Self::ShapeSecurity
148 | Self::Imperva => 2,
149 Self::Unknown => 0,
150 }
151 }
152
153 /// Parse a [`VendorId`] from its [`label`][Self::label].
154 ///
155 /// # Example
156 ///
157 /// ```
158 /// use stygian_charon::vendor_classifier::VendorId;
159 ///
160 /// assert_eq!(VendorId::from_label("datadome"), Some(VendorId::DataDome));
161 /// assert_eq!(VendorId::from_label("cloudflare"), Some(VendorId::Cloudflare));
162 /// assert_eq!(VendorId::from_label("nope"), None);
163 /// ```
164 #[must_use]
165 pub fn from_label(label: &str) -> Option<Self> {
166 match label {
167 "akamai" => Some(Self::Akamai),
168 "cloudflare" => Some(Self::Cloudflare),
169 "datadome" => Some(Self::DataDome),
170 "perimeter_x" => Some(Self::PerimeterX),
171 "hcaptcha" => Some(Self::Hcaptcha),
172 "recaptcha" => Some(Self::Recaptcha),
173 "kasada" => Some(Self::Kasada),
174 "fingerprint_com" => Some(Self::FingerprintCom),
175 "shape_security" => Some(Self::ShapeSecurity),
176 "imperva" => Some(Self::Imperva),
177 "unknown" => Some(Self::Unknown),
178 _ => None,
179 }
180 }
181}
182
183/// One signal row from a vendor definition's `[[signals]]` table.
184///
185/// A signal is the smallest unit the classifier matches against the
186/// input strings (cookies, headers, challenge URLs, body markers,
187/// scripts). Patterns are matched **case-insensitively** — the
188/// loader lower-cases them at load time so the per-request
189/// classification hot path never has to.
190///
191/// # Example
192///
193/// ```
194/// use stygian_charon::vendor_classifier::{EvidenceSource, VendorSignal};
195///
196/// let s = VendorSignal {
197/// pattern: "x-datadome".to_string(),
198/// source: EvidenceSource::Header,
199/// weight: 5,
200/// };
201/// assert_eq!(s.weight, 5);
202/// ```
203#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
204pub struct VendorSignal {
205 /// Literal pattern to search for (case-insensitive).
206 pub pattern: String,
207 /// Which input channel the pattern is matched against.
208 pub source: EvidenceSource,
209 /// Weight contributed to the vendor score on a hit.
210 pub weight: u32,
211}
212
213/// One vendor's signal catalogue. Multiple vendors can ship
214/// definitions; the [`crate::vendor_classifier::VendorClassifier`]
215/// consumes them all and ranks the matches.
216///
217/// Definitions are loaded from TOML at compile time via
218/// `include_str!`. The schema is
219/// `serde::Deserialize` so the same TOML files double as the
220/// operator-facing configuration surface.
221///
222/// # Example
223///
224/// ```
225/// use stygian_charon::vendor_classifier::{VendorDefinition, VendorId, VendorSignal, EvidenceSource};
226///
227/// let def = VendorDefinition {
228/// id: VendorId::DataDome,
229/// display_name: "DataDome".to_string(),
230/// description: "baseline".to_string(),
231/// tier: 1,
232/// signals: vec![VendorSignal {
233/// pattern: "x-datadome".to_string(),
234/// source: EvidenceSource::Header,
235/// weight: 5,
236/// }],
237/// };
238/// assert!(def.validate().is_ok());
239/// ```
240#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
241pub struct VendorDefinition {
242 /// Vendor identifier from the [`VendorId`] enum.
243 pub id: VendorId,
244 /// Human-readable display name (used in operator logs).
245 pub display_name: String,
246 /// Short description of the vendor stack.
247 #[serde(default)]
248 pub description: String,
249 /// Tier (1 = always shipped, 2 = taxonomy-only).
250 pub tier: u8,
251 /// Signal catalogue.
252 #[serde(default)]
253 pub signals: Vec<VendorSignal>,
254}
255
256impl VendorDefinition {
257 /// Validate the definition's internal consistency.
258 ///
259 /// # Errors
260 ///
261 /// Returns [`VendorError`] on the first inconsistency. The
262 /// error embeds the field path and the bad value so operators
263 /// can locate the offending TOML line without re-running the
264 /// loader.
265 pub fn validate(&self) -> Result<(), VendorError> {
266 if self.display_name.trim().is_empty() {
267 return Err(VendorError::invalid_field(
268 self.id.label(),
269 "display_name",
270 self.display_name.clone(),
271 "display_name must be a non-empty string",
272 ));
273 }
274 if !(0..=2).contains(&self.tier) {
275 return Err(VendorError::invalid_field(
276 self.id.label(),
277 "tier",
278 self.tier,
279 "tier must be 0 (unknown), 1 (baseline), or 2 (taxonomy-only)",
280 ));
281 }
282 for (i, sig) in self.signals.iter().enumerate() {
283 if sig.pattern.trim().is_empty() {
284 return Err(VendorError::invalid_field(
285 self.id.label(),
286 format!("signals[{i}].pattern"),
287 sig.pattern.clone(),
288 "pattern must be a non-empty string",
289 ));
290 }
291 if sig.weight == 0 {
292 return Err(VendorError::invalid_field(
293 self.id.label(),
294 format!("signals[{i}].weight"),
295 sig.weight,
296 "weight must be > 0",
297 ));
298 }
299 }
300 Ok(())
301 }
302
303 /// Return the signals, indexed by [`EvidenceSource`] for fast
304 /// classification.
305 #[must_use]
306 pub fn signals_by_source(&self) -> BTreeMap<EvidenceSource, Vec<&VendorSignal>> {
307 let mut grouped: BTreeMap<EvidenceSource, Vec<&VendorSignal>> = BTreeMap::new();
308 for sig in &self.signals {
309 grouped.entry(sig.source).or_default().push(sig);
310 }
311 grouped
312 }
313}
314
315/// Parse a raw TOML payload into a [`VendorDefinition`].
316///
317/// The TOML is expected to declare the `id` field as the lower-case
318/// `VendorId` label (e.g. `"datadome"`). The loader maps that label
319/// into a [`VendorId`] discriminant and rejects unknown ids with
320/// [`VendorError::UnknownVendorId`].
321///
322/// # Errors
323///
324/// Returns [`VendorError`] when the TOML fails to parse, the
325/// declared id is not part of the supported taxonomy, or the
326/// resulting [`VendorDefinition`] fails [`validate`][VendorDefinition::validate].
327pub fn parse_vendor_definition(toml_text: &str) -> Result<VendorDefinition, VendorError> {
328 #[derive(Deserialize)]
329 struct RawDefinition {
330 id: String,
331 display_name: String,
332 #[serde(default)]
333 description: String,
334 #[serde(default = "default_tier")]
335 tier: u8,
336 #[serde(default)]
337 signals: Vec<VendorSignal>,
338 }
339
340 let raw: RawDefinition = toml::from_str(toml_text)?;
341 let id = VendorId::from_label(&raw.id).ok_or_else(|| VendorError::UnknownVendorId {
342 vendor_id: raw.id.clone(),
343 })?;
344 let def = VendorDefinition {
345 id,
346 display_name: raw.display_name,
347 description: raw.description,
348 tier: raw.tier,
349 signals: raw
350 .signals
351 .into_iter()
352 .map(|mut s| {
353 s.pattern = s.pattern.to_ascii_lowercase();
354 s
355 })
356 .collect(),
357 };
358 def.validate()?;
359 Ok(def)
360}
361
/// Serde fallback for a vendor definition whose TOML omits `tier`.
const fn default_tier() -> u8 {
    // Tier 1 is the "always shipped" baseline tier.
    const BASELINE_TIER: u8 = 1;
    BASELINE_TIER
}
365
// Unit tests for the vendor taxonomy and the TOML loader.
//
// The clippy allowances below are deliberate: in tests a panic *is*
// the failure report, and indexing a fixture of known length is fine.
#[cfg(test)]
#[allow(
    clippy::unwrap_used,
    clippy::expect_used,
    clippy::panic,
    clippy::indexing_slicing
)]
mod tests {
    use super::*;

    /// Every variant, in declaration (= tie-break) order.
    const ALL_VENDORS: [VendorId; 11] = [
        VendorId::Akamai,
        VendorId::Cloudflare,
        VendorId::DataDome,
        VendorId::PerimeterX,
        VendorId::Hcaptcha,
        VendorId::Recaptcha,
        VendorId::Kasada,
        VendorId::FingerprintCom,
        VendorId::ShapeSecurity,
        VendorId::Imperva,
        VendorId::Unknown,
    ];

    /// Shorthand for a signal fixture.
    fn signal(pattern: &str, source: EvidenceSource, weight: u32) -> VendorSignal {
        VendorSignal {
            pattern: pattern.to_string(),
            source,
            weight,
        }
    }

    /// Shorthand for a `DataDome` definition fixture with an empty description.
    fn definition(display_name: &str, tier: u8, signals: Vec<VendorSignal>) -> VendorDefinition {
        VendorDefinition {
            id: VendorId::DataDome,
            display_name: display_name.to_string(),
            description: String::new(),
            tier,
            signals,
        }
    }

    #[test]
    fn vendor_id_labels_round_trip() {
        for v in ALL_VENDORS {
            assert_eq!(VendorId::from_label(v.label()), Some(v));
        }
    }

    #[test]
    fn vendor_id_unknown_label_returns_none() {
        assert_eq!(VendorId::from_label("nope"), None);
        assert_eq!(VendorId::from_label(""), None);
        assert_eq!(VendorId::from_label("DataDome"), None); // case-sensitive
    }

    #[test]
    fn vendor_id_tier_matches_taxonomy_table() {
        // Tier 1.
        assert_eq!(VendorId::DataDome.tier(), 1);
        assert_eq!(VendorId::PerimeterX.tier(), 1);
        assert_eq!(VendorId::Akamai.tier(), 1);
        assert_eq!(VendorId::Cloudflare.tier(), 1);
        // Tier 2 — all six taxonomy-only vendors.
        assert_eq!(VendorId::Hcaptcha.tier(), 2);
        assert_eq!(VendorId::Recaptcha.tier(), 2);
        assert_eq!(VendorId::Kasada.tier(), 2);
        assert_eq!(VendorId::FingerprintCom.tier(), 2);
        assert_eq!(VendorId::ShapeSecurity.tier(), 2);
        assert_eq!(VendorId::Imperva.tier(), 2);
        // Catch-all.
        assert_eq!(VendorId::Unknown.tier(), 0);
    }

    #[test]
    fn vendor_id_order_is_the_tie_break_order() {
        // Declaration order is strictly increasing under the derived `Ord`.
        for pair in ALL_VENDORS.windows(2) {
            assert!(pair[0] < pair[1], "{:?} must sort before {:?}", pair[0], pair[1]);
        }
        // `Unknown` sorts last and is the default.
        assert_eq!(ALL_VENDORS.iter().copied().max(), Some(VendorId::Unknown));
        assert_eq!(VendorId::default(), VendorId::Unknown);
    }

    #[test]
    fn definition_rejects_empty_display_name() {
        let err = definition("", 1, Vec::new())
            .validate()
            .expect_err("empty display_name");
        assert_eq!(err.field_path(), Some("display_name"));
    }

    #[test]
    fn definition_rejects_whitespace_only_display_name() {
        let err = definition(" \t", 1, Vec::new())
            .validate()
            .expect_err("blank display_name");
        assert_eq!(err.field_path(), Some("display_name"));
    }

    #[test]
    fn definition_rejects_out_of_range_tier() {
        let err = definition("x", 9, Vec::new())
            .validate()
            .expect_err("bad tier");
        assert_eq!(err.field_path(), Some("tier"));
    }

    #[test]
    fn definition_accepts_every_supported_tier() {
        for tier in 0..=2 {
            assert!(definition("x", tier, Vec::new()).validate().is_ok());
        }
    }

    #[test]
    fn definition_rejects_empty_pattern() {
        let def = definition("x", 1, vec![signal("", EvidenceSource::Header, 5)]);
        let err = def.validate().expect_err("empty pattern");
        assert!(err.field_path().is_some_and(|p| p.contains("signals[0]")));
    }

    #[test]
    fn definition_rejects_whitespace_only_pattern() {
        let def = definition("x", 1, vec![signal("   ", EvidenceSource::Header, 5)]);
        let err = def.validate().expect_err("blank pattern");
        assert!(err.field_path().is_some_and(|p| p.contains("signals[0]")));
    }

    #[test]
    fn definition_rejects_zero_weight() {
        let def = definition("x", 1, vec![signal("x", EvidenceSource::Header, 0)]);
        let err = def.validate().expect_err("zero weight");
        assert!(err.field_path().is_some_and(|p| p.contains("signals[0]")));
    }

    #[test]
    fn parse_vendor_definition_round_trips_through_toml() {
        let toml_text = r#"
id = "datadome"
display_name = "DataDome"
description = "test"
tier = 1

[[signals]]
pattern = "X-DATADOME"
source = "header"
weight = 5
"#;
        let def = parse_vendor_definition(toml_text).expect("parse");
        assert_eq!(def.id, VendorId::DataDome);
        assert_eq!(def.tier, 1);
        // Patterns are case-folded at load time.
        assert_eq!(def.signals[0].pattern, "x-datadome");
    }

    #[test]
    fn parse_vendor_definition_defaults_omitted_fields() {
        let toml_text = r#"
id = "datadome"
display_name = "DataDome"
"#;
        let def = parse_vendor_definition(toml_text).expect("parse");
        assert_eq!(def.tier, 1);
        assert!(def.description.is_empty());
        assert!(def.signals.is_empty());
    }

    #[test]
    fn parse_vendor_definition_rejects_unknown_id() {
        let toml_text = r#"
id = "nope"
display_name = "Nope"
tier = 1
"#;
        let err = parse_vendor_definition(toml_text).expect_err("unknown id");
        assert!(matches!(err, VendorError::UnknownVendorId { .. }));
    }

    #[test]
    fn signals_by_source_groups_correctly() {
        let def = definition(
            "x",
            1,
            vec![
                signal("a", EvidenceSource::Header, 1),
                signal("b", EvidenceSource::Header, 2),
                signal("c", EvidenceSource::Cookie, 3),
            ],
        );
        let grouped = def.signals_by_source();
        assert_eq!(grouped.get(&EvidenceSource::Header).map(Vec::len), Some(2));
        assert_eq!(grouped.get(&EvidenceSource::Cookie).map(Vec::len), Some(1));
        assert_eq!(grouped.get(&EvidenceSource::BodyMarker).map(Vec::len), None);
    }
}