stygian_charon/vendor_classifier/evidence.rs
1//! Vendor-classifier evidence types (T89).
2//!
3//! Every signal the [`VendorClassifier`][crate::vendor_classifier::VendorClassifier]
4//! observes is recorded as an [`Evidence`] item, labelled by its
5//! [`EvidenceSource`]. The bundle of matched evidence is returned
6//! alongside the ranked scores so diagnostics consumers can audit
7//! *why* the classifier picked a given vendor without re-running it.
8//!
9//! ## Determinism
10//!
//! Signals are sorted by the `(source, signal)` pair — first by
//! source, then lexicographically by signal — before the score is
//! computed. This keeps the confidence output stable across runs
//! even when the input vectors are produced in different orders (a
//! common pitfall when assembling cookie/header/body strings from
//! independent matchers).
16
17use std::collections::BTreeMap;
18
19use serde::{Deserialize, Serialize};
20
21/// Where a single classifier signal came from.
22///
23/// The five variants are the documented input channels for
24/// [`crate::vendor_classifier::VendorClassifier`]:
25///
26/// | Source | Where it was found |
27/// |----------------|-----------------------------------------------------|
28/// | `Cookie` | A `Set-Cookie` response header or `Cookie` header. |
29/// | `Header` | Any other response header. |
30/// | `ChallengeUrl` | A challenge/redirect URL (request URL or `Location`).|
31/// | `BodyMarker` | A literal string in the response body snippet. |
32/// | `Script` | A literal in a `<script>` snippet (inline JS). |
33///
34/// The taxonomy is `#[serde(rename_all = "snake_case")]` so the
35/// wire form is stable across releases.
36///
37/// # Example
38///
39/// ```
40/// use stygian_charon::vendor_classifier::EvidenceSource;
41///
42/// let src = EvidenceSource::Cookie;
43/// assert_eq!(src.label(), "cookie");
44/// ```
45#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord, Serialize, Deserialize)]
46#[serde(rename_all = "snake_case")]
47pub enum EvidenceSource {
48 /// `Set-Cookie` or `Cookie` header.
49 Cookie,
50 /// Any non-cookie response header.
51 Header,
52 /// Challenge/redirect URL (request URL or `Location` header).
53 ChallengeUrl,
54 /// Literal string in the response body.
55 BodyMarker,
56 /// Literal in a `<script>` block (inline JS challenge).
57 Script,
58}
59
60impl EvidenceSource {
61 /// Stable, human-readable label.
62 ///
63 /// # Example
64 ///
65 /// ```
66 /// use stygian_charon::vendor_classifier::EvidenceSource;
67 ///
68 /// assert_eq!(EvidenceSource::Cookie.label(), "cookie");
69 /// assert_eq!(EvidenceSource::Header.label(), "header");
70 /// assert_eq!(EvidenceSource::ChallengeUrl.label(), "challenge_url");
71 /// assert_eq!(EvidenceSource::BodyMarker.label(), "body_marker");
72 /// assert_eq!(EvidenceSource::Script.label(), "script");
73 /// ```
74 #[must_use]
75 pub const fn label(self) -> &'static str {
76 match self {
77 Self::Cookie => "cookie",
78 Self::Header => "header",
79 Self::ChallengeUrl => "challenge_url",
80 Self::BodyMarker => "body_marker",
81 Self::Script => "script",
82 }
83 }
84}
85
86/// One matched signal in the evidence bundle.
87///
88/// An `Evidence` row is the **smallest auditable unit** the
89/// classifier emits. Each row carries:
90///
91/// - the literal `signal` text that matched,
92/// - the [`EvidenceSource`] it came from,
93/// - and the `weight` (sourced from the vendor definition) that
94/// the classifier added to the vendor's score for this match.
95///
96/// # Example
97///
98/// ```
99/// use stygian_charon::vendor_classifier::{Evidence, EvidenceSource};
100///
101/// let ev = Evidence {
102/// signal: "_abck=".to_string(),
103/// source: EvidenceSource::Cookie,
104/// weight: 5,
105/// };
106/// assert_eq!(ev.source, EvidenceSource::Cookie);
107/// assert_eq!(ev.weight, 5);
108/// ```
109#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
110pub struct Evidence {
111 /// The literal text that matched (case-folded, lower-cased).
112 pub signal: String,
113 /// Which input channel produced the match.
114 pub source: EvidenceSource,
115 /// Weight contributed to the vendor score (from the vendor
116 /// definition's `signals[*].weight`).
117 pub weight: u32,
118}
119
120/// Bundle of every [`Evidence`] item the classifier observed,
121/// plus a per-source count summary.
122///
123/// The bundle is **append-only**: the classifier never drops or
124/// re-orders evidence after the match phase.
125///
126/// The [`source_summary`][Self::source_summary] is a precomputed
127/// `BTreeMap` so consumers can render a compact "matched
128/// `n_cookies` cookie + `n_headers` header" summary without
129/// walking the evidence vector.
130///
131/// # Example
132///
133/// ```
134/// use stygian_charon::vendor_classifier::{Evidence, EvidenceBundle, EvidenceSource};
135///
136/// let bundle = EvidenceBundle {
137/// items: vec![Evidence {
138/// signal: "x-datadome".to_string(),
139/// source: EvidenceSource::Header,
140/// weight: 5,
141/// }],
142/// source_summary: vec![(EvidenceSource::Header, 1)].into_iter().collect(),
143/// };
144/// assert_eq!(bundle.items.len(), 1);
145/// assert_eq!(bundle.source_summary.get(&EvidenceSource::Header), Some(&1));
146/// ```
147#[derive(Debug, Clone, PartialEq, Eq, Default, Serialize, Deserialize)]
148pub struct EvidenceBundle {
149 /// Every evidence row the classifier observed, in match order.
150 pub items: Vec<Evidence>,
151 /// Precomputed per-source count summary.
152 pub source_summary: BTreeMap<EvidenceSource, usize>,
153}
154
155impl EvidenceBundle {
156 /// Total number of evidence items in the bundle.
157 #[must_use]
158 pub const fn len(&self) -> usize {
159 self.items.len()
160 }
161
162 /// `true` when the bundle is empty (no signals matched).
163 #[must_use]
164 pub const fn is_empty(&self) -> bool {
165 self.items.is_empty()
166 }
167
168 /// All evidence rows that came from a single source.
169 pub fn for_source(&self, source: EvidenceSource) -> impl Iterator<Item = &Evidence> {
170 self.items.iter().filter(move |e| e.source == source)
171 }
172}
173
#[cfg(test)]
// Panicking is how a test reports failure, so the panic-related
// lints are relaxed inside this module only.
#[allow(
    clippy::unwrap_used,
    clippy::expect_used,
    clippy::panic,
    clippy::indexing_slicing
)]
mod tests {
    use super::*;

    /// Every variant paired with the label it is expected to emit.
    const LABELLED_SOURCES: [(EvidenceSource, &str); 5] = [
        (EvidenceSource::Cookie, "cookie"),
        (EvidenceSource::Header, "header"),
        (EvidenceSource::ChallengeUrl, "challenge_url"),
        (EvidenceSource::BodyMarker, "body_marker"),
        (EvidenceSource::Script, "script"),
    ];

    /// Builds one evidence row from borrowed signal text.
    fn row(signal: &str, source: EvidenceSource, weight: u32) -> Evidence {
        Evidence {
            signal: signal.to_string(),
            source,
            weight,
        }
    }

    #[test]
    fn evidence_source_labels_are_stable() {
        for (source, expected) in LABELLED_SOURCES {
            assert_eq!(source.label(), expected);
        }
    }

    #[test]
    fn evidence_source_serde_round_trip_is_stable() {
        for (source, _) in LABELLED_SOURCES {
            let encoded = serde_json::to_string(&source).expect("serialize");
            let decoded: EvidenceSource = serde_json::from_str(&encoded).expect("deserialize");
            // The value must survive the round trip ...
            assert_eq!(source, decoded);
            // ... and the wire form must be the label as a JSON string.
            assert_eq!(encoded, format!("\"{}\"", source.label()));
        }
    }

    #[test]
    fn evidence_bundle_filter_by_source_is_correct() {
        let bundle = EvidenceBundle {
            items: vec![
                row("x-datadome", EvidenceSource::Header, 5),
                row("datadome=", EvidenceSource::Cookie, 4),
                row("cf-ray", EvidenceSource::Header, 5),
            ],
            source_summary: BTreeMap::from([
                (EvidenceSource::Header, 2),
                (EvidenceSource::Cookie, 1),
            ]),
        };

        let header_rows = bundle.for_source(EvidenceSource::Header).count();
        assert_eq!(header_rows, 2);
        assert_eq!(bundle.len(), 3);
        assert!(!bundle.is_empty());
    }

    #[test]
    fn empty_bundle_is_empty() {
        let empty = EvidenceBundle::default();
        assert_eq!(empty.len(), 0);
        assert!(empty.is_empty());
    }
}
245}