Skip to main content

stygian_charon/vendor_classifier/
evidence.rs

1//! Vendor-classifier evidence types (T89).
2//!
3//! Every signal the [`VendorClassifier`][crate::vendor_classifier::VendorClassifier]
4//! observes is recorded as an [`Evidence`] item, labelled by its
5//! [`EvidenceSource`]. The bundle of matched evidence is returned
6//! alongside the ranked scores so diagnostics consumers can audit
7//! *why* the classifier picked a given vendor without re-running it.
8//!
9//! ## Determinism
10//!
11//! Signals are sorted by `(source, signal)` in lexicographic order
12//! before the score is computed. This keeps the confidence output
13//! stable across runs even when the input vectors are produced in
14//! different orders (a common pitfall when assembling
15//! cookie/header/body strings from independent matchers).
16
17use std::collections::BTreeMap;
18
19use serde::{Deserialize, Serialize};
20
21/// Where a single classifier signal came from.
22///
23/// The five variants are the documented input channels for
24/// [`crate::vendor_classifier::VendorClassifier`]:
25///
26/// | Source         | Where it was found                                  |
27/// |----------------|-----------------------------------------------------|
28/// | `Cookie`       | A `Set-Cookie` response header or `Cookie` header. |
29/// | `Header`       | Any other response header.                          |
30/// | `ChallengeUrl` | A challenge/redirect URL (request URL or `Location`).|
31/// | `BodyMarker`   | A literal string in the response body snippet.      |
32/// | `Script`       | A literal in a `<script>` snippet (inline JS).      |
33///
34/// The taxonomy is `#[serde(rename_all = "snake_case")]` so the
35/// wire form is stable across releases.
36///
37/// # Example
38///
39/// ```
40/// use stygian_charon::vendor_classifier::EvidenceSource;
41///
42/// let src = EvidenceSource::Cookie;
43/// assert_eq!(src.label(), "cookie");
44/// ```
45#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord, Serialize, Deserialize)]
46#[serde(rename_all = "snake_case")]
47pub enum EvidenceSource {
48    /// `Set-Cookie` or `Cookie` header.
49    Cookie,
50    /// Any non-cookie response header.
51    Header,
52    /// Challenge/redirect URL (request URL or `Location` header).
53    ChallengeUrl,
54    /// Literal string in the response body.
55    BodyMarker,
56    /// Literal in a `<script>` block (inline JS challenge).
57    Script,
58}
59
60impl EvidenceSource {
61    /// Stable, human-readable label.
62    ///
63    /// # Example
64    ///
65    /// ```
66    /// use stygian_charon::vendor_classifier::EvidenceSource;
67    ///
68    /// assert_eq!(EvidenceSource::Cookie.label(), "cookie");
69    /// assert_eq!(EvidenceSource::Header.label(), "header");
70    /// assert_eq!(EvidenceSource::ChallengeUrl.label(), "challenge_url");
71    /// assert_eq!(EvidenceSource::BodyMarker.label(), "body_marker");
72    /// assert_eq!(EvidenceSource::Script.label(), "script");
73    /// ```
74    #[must_use]
75    pub const fn label(self) -> &'static str {
76        match self {
77            Self::Cookie => "cookie",
78            Self::Header => "header",
79            Self::ChallengeUrl => "challenge_url",
80            Self::BodyMarker => "body_marker",
81            Self::Script => "script",
82        }
83    }
84}
85
86/// One matched signal in the evidence bundle.
87///
88/// An `Evidence` row is the **smallest auditable unit** the
89/// classifier emits. Each row carries:
90///
91/// - the literal `signal` text that matched,
92/// - the [`EvidenceSource`] it came from,
93/// - and the `weight` (sourced from the vendor definition) that
94///   the classifier added to the vendor's score for this match.
95///
96/// # Example
97///
98/// ```
99/// use stygian_charon::vendor_classifier::{Evidence, EvidenceSource};
100///
101/// let ev = Evidence {
102///     signal: "_abck=".to_string(),
103///     source: EvidenceSource::Cookie,
104///     weight: 5,
105/// };
106/// assert_eq!(ev.source, EvidenceSource::Cookie);
107/// assert_eq!(ev.weight, 5);
108/// ```
109#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
110pub struct Evidence {
111    /// The literal text that matched (case-folded, lower-cased).
112    pub signal: String,
113    /// Which input channel produced the match.
114    pub source: EvidenceSource,
115    /// Weight contributed to the vendor score (from the vendor
116    /// definition's `signals[*].weight`).
117    pub weight: u32,
118}
119
120/// Bundle of every [`Evidence`] item the classifier observed,
121/// plus a per-source count summary.
122///
123/// The bundle is **append-only**: the classifier never drops or
124/// re-orders evidence after the match phase.
125///
126/// The [`source_summary`][Self::source_summary] is a precomputed
127/// `BTreeMap` so consumers can render a compact "matched
128/// `n_cookies` cookie + `n_headers` header" summary without
129/// walking the evidence vector.
130///
131/// # Example
132///
133/// ```
134/// use stygian_charon::vendor_classifier::{Evidence, EvidenceBundle, EvidenceSource};
135///
136/// let bundle = EvidenceBundle {
137///     items: vec![Evidence {
138///         signal: "x-datadome".to_string(),
139///         source: EvidenceSource::Header,
140///         weight: 5,
141///     }],
142///     source_summary: vec![(EvidenceSource::Header, 1)].into_iter().collect(),
143/// };
144/// assert_eq!(bundle.items.len(), 1);
145/// assert_eq!(bundle.source_summary.get(&EvidenceSource::Header), Some(&1));
146/// ```
147#[derive(Debug, Clone, PartialEq, Eq, Default, Serialize, Deserialize)]
148pub struct EvidenceBundle {
149    /// Every evidence row the classifier observed, in match order.
150    pub items: Vec<Evidence>,
151    /// Precomputed per-source count summary.
152    pub source_summary: BTreeMap<EvidenceSource, usize>,
153}
154
155impl EvidenceBundle {
156    /// Total number of evidence items in the bundle.
157    #[must_use]
158    pub const fn len(&self) -> usize {
159        self.items.len()
160    }
161
162    /// `true` when the bundle is empty (no signals matched).
163    #[must_use]
164    pub const fn is_empty(&self) -> bool {
165        self.items.is_empty()
166    }
167
168    /// All evidence rows that came from a single source.
169    pub fn for_source(&self, source: EvidenceSource) -> impl Iterator<Item = &Evidence> {
170        self.items.iter().filter(move |e| e.source == source)
171    }
172}
173
#[cfg(test)]
// Panicking helpers are acceptable here: inside a test, a failed
// unwrap/expect/index is simply a test failure.
#[allow(
    clippy::unwrap_used,
    clippy::expect_used,
    clippy::panic,
    clippy::indexing_slicing
)]
mod tests {
    use super::*;

    /// Every [`EvidenceSource`] variant, in declaration order.
    const ALL_SOURCES: [EvidenceSource; 5] = [
        EvidenceSource::Cookie,
        EvidenceSource::Header,
        EvidenceSource::ChallengeUrl,
        EvidenceSource::BodyMarker,
        EvidenceSource::Script,
    ];

    /// Builds one evidence row from its three parts.
    fn row(signal: &str, source: EvidenceSource, weight: u32) -> Evidence {
        Evidence {
            signal: signal.to_string(),
            source,
            weight,
        }
    }

    /// Each variant maps to its documented label.
    #[test]
    fn evidence_source_labels_are_stable() {
        let expected = ["cookie", "header", "challenge_url", "body_marker", "script"];
        for (source, label) in ALL_SOURCES.iter().copied().zip(expected) {
            assert_eq!(source.label(), label);
        }
    }

    /// JSON encoding round-trips and equals the quoted label.
    #[test]
    fn evidence_source_serde_round_trip_is_stable() {
        for source in ALL_SOURCES {
            let encoded = serde_json::to_string(&source).expect("serialize");
            let decoded: EvidenceSource = serde_json::from_str(&encoded).expect("deserialize");
            assert_eq!(source, decoded);
            assert_eq!(encoded, format!("\"{}\"", source.label()));
        }
    }

    /// `for_source` yields only rows from the requested channel.
    #[test]
    fn evidence_bundle_filter_by_source_is_correct() {
        let items = vec![
            row("x-datadome", EvidenceSource::Header, 5),
            row("datadome=", EvidenceSource::Cookie, 4),
            row("cf-ray", EvidenceSource::Header, 5),
        ];
        let source_summary = vec![(EvidenceSource::Header, 2), (EvidenceSource::Cookie, 1)]
            .into_iter()
            .collect();
        let bundle = EvidenceBundle {
            items,
            source_summary,
        };

        let header_rows = bundle.for_source(EvidenceSource::Header).count();
        assert_eq!(header_rows, 2);
        assert_eq!(bundle.len(), 3);
        assert!(!bundle.is_empty());
    }

    /// A default-constructed bundle holds no rows.
    #[test]
    fn empty_bundle_is_empty() {
        let bundle = EvidenceBundle::default();
        assert_eq!(bundle.len(), 0);
        assert!(bundle.is_empty());
    }
}