Skip to main content

stygian_browser/
similarity.rs

1//! Adaptive element similarity search for [`crate::page::PageHandle`].
2//!
3//! Computes a structural fingerprint for a reference DOM node and finds
4//! candidates on the current page that exceed a configurable similarity
5//! threshold, even after class names or depth have shifted.
6
7use crate::page::NodeHandle;
8
9// ─── ElementFingerprint ───────────────────────────────────────────────────────
10
11/// A structural snapshot of a DOM element for similarity comparison.
12///
13/// Serialisable so callers can persist fingerprints across sessions.
14///
15/// # Example
16///
17/// ```
18/// use stygian_browser::similarity::ElementFingerprint;
19///
20/// let fp = ElementFingerprint {
21///     tag: "div".to_string(),
22///     classes: vec!["card".to_string(), "highlighted".to_string()],
23///     attr_names: vec!["data-id".to_string()],
24///     depth: 3,
25/// };
26/// let json = serde_json::to_string(&fp).unwrap();
27/// let back: ElementFingerprint = serde_json::from_str(&json).unwrap();
28/// assert_eq!(fp.tag, back.tag);
29/// ```
30#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
31pub struct ElementFingerprint {
32    /// Lower-case tag name (e.g. `"div"`, `"a"`).
33    pub tag: String,
34    /// Sorted CSS class list (excluding the empty string).
35    pub classes: Vec<String>,
36    /// Sorted attribute names present on the element, excluding `"class"` and
37    /// `"id"`.
38    #[serde(rename = "attrNames")]
39    pub attr_names: Vec<String>,
40    /// Distance from `<body>` in the DOM tree (direct children of `<body>`
41    /// have depth `0`).
42    pub depth: u32,
43}
44
45// ─── SimilarityConfig ─────────────────────────────────────────────────────────
46
47/// Tuning parameters for [`crate::page::PageHandle::find_similar`].
48///
49/// # Example
50///
51/// ```
52/// use stygian_browser::similarity::SimilarityConfig;
53///
54/// let cfg = SimilarityConfig { threshold: 0.5, max_results: 5 };
55/// assert!(cfg.threshold < SimilarityConfig::DEFAULT_THRESHOLD);
56/// ```
57#[derive(Debug, Clone)]
58pub struct SimilarityConfig {
59    /// Minimum score `[0.0, 1.0]` for a candidate to be included in results.
60    ///
61    /// Default: [`DEFAULT_THRESHOLD`](Self::DEFAULT_THRESHOLD).
62    pub threshold: f32,
63    /// Maximum number of results to return.  `0` means unlimited.
64    ///
65    /// Default: `10`.
66    pub max_results: usize,
67}
68
69impl SimilarityConfig {
70    /// Default minimum similarity threshold (`0.7`).
71    pub const DEFAULT_THRESHOLD: f32 = 0.7;
72}
73
74impl Default for SimilarityConfig {
75    fn default() -> Self {
76        Self {
77            threshold: Self::DEFAULT_THRESHOLD,
78            max_results: 10,
79        }
80    }
81}
82
83// ─── SimilarMatch ─────────────────────────────────────────────────────────────
84
85/// A candidate element that exceeded the similarity threshold.
86pub struct SimilarMatch {
87    /// The matching node handle.
88    pub node: NodeHandle,
89    /// Similarity score in `[0.0, 1.0]`.
90    pub score: f32,
91}
92
93// ─── Scoring ──────────────────────────────────────────────────────────────────
94
95/// Compute the weighted Jaccard similarity between two element fingerprints.
96///
97/// Weights:
98/// | Component | Weight |
99/// |-----------|--------|
100/// | Tag name match | 0.40 |
101/// | Class list Jaccard | 0.35 |
102/// | Attribute name Jaccard | 0.15 |
103/// | Depth proximity | 0.10 |
104///
105/// Returns a score in `[0.0, 1.0]`.
106///
107/// # Example
108///
109/// ```
110/// use stygian_browser::similarity::{ElementFingerprint, jaccard_weighted};
111///
112/// let a = ElementFingerprint { tag: "div".into(), classes: vec!["foo".into()], attr_names: vec![], depth: 2 };
113/// let b = a.clone();
114/// assert!((jaccard_weighted(&a, &b) - 1.0).abs() < 1e-6);
115/// ```
116#[must_use]
117pub fn jaccard_weighted(reference: &ElementFingerprint, candidate: &ElementFingerprint) -> f32 {
118    let tag_score = if reference.tag == candidate.tag {
119        1.0_f32
120    } else {
121        0.0_f32
122    };
123
124    let class_score = jaccard_sets(&reference.classes, &candidate.classes);
125    let attr_score = jaccard_sets(&reference.attr_names, &candidate.attr_names);
126
127    // DOM tree depth is always small (< 1000 in practice); truncate to u16
128    // for a lossless f32 conversion (u16 fits exactly in f32's 23-bit mantissa).
129    let ref_depth = f32::from(u16::try_from(reference.depth).unwrap_or(u16::MAX));
130    let cand_depth = f32::from(u16::try_from(candidate.depth).unwrap_or(u16::MAX));
131    let depth_diff = (ref_depth - cand_depth).abs();
132    let depth_score = 1.0_f32 / (1.0_f32 + depth_diff);
133
134    depth_score.mul_add(
135        0.1_f32,
136        attr_score.mul_add(0.15_f32, tag_score.mul_add(0.4_f32, class_score * 0.35_f32)),
137    )
138}
139
140/// Compute Jaccard similarity (|intersection| / |union|) for two **sorted**
141/// string slices.
142///
143/// Returns `1.0` when both slices are empty (they are identical).
144fn jaccard_sets(a: &[String], b: &[String]) -> f32 {
145    if a.is_empty() && b.is_empty() {
146        return 1.0_f32;
147    }
148    let mut intersection: usize = 0;
149    let mut i = 0_usize;
150    let mut j = 0_usize;
151    while i < a.len() && j < b.len() {
152        let (Some(ai), Some(bj)) = (a.get(i), b.get(j)) else {
153            break;
154        };
155        match ai.cmp(bj) {
156            std::cmp::Ordering::Equal => {
157                intersection += 1;
158                i += 1;
159                j += 1;
160            }
161            std::cmp::Ordering::Less => i += 1,
162            std::cmp::Ordering::Greater => j += 1,
163        }
164    }
165    let union = a.len() + b.len() - intersection;
166    // union > 0 because at least one slice is non-empty (guarded above)
167    // Class/attribute counts are tiny (< 100); u16 fits losslessly in f32.
168    let i_f = f32::from(u16::try_from(intersection).unwrap_or(u16::MAX));
169    let u_f = f32::from(u16::try_from(union).unwrap_or(u16::MAX));
170    i_f / u_f
171}
172
173// ─── Tests ────────────────────────────────────────────────────────────────────
174
175#[cfg(test)]
176#[allow(clippy::expect_used)] // serde failures in deterministic tests are programmer errors
177mod tests {
178    use super::*;
179
180    fn fp(tag: &str, classes: &[&str], attrs: &[&str], depth: u32) -> ElementFingerprint {
181        ElementFingerprint {
182            tag: tag.to_string(),
183            classes: classes.iter().map(|s| (*s).to_string()).collect(),
184            attr_names: attrs.iter().map(|s| (*s).to_string()).collect(),
185            depth,
186        }
187    }
188
189    #[test]
190    fn jaccard_identical() {
191        let a = fp("div", &["card", "highlighted"], &["data-id"], 3);
192        let b = a.clone();
193        let score = jaccard_weighted(&a, &b);
194        assert!(
195            (score - 1.0_f32).abs() < 1e-5_f32,
196            "identical fingerprints should score 1.0, got {score}"
197        );
198    }
199
200    #[test]
201    fn jaccard_disjoint() {
202        let a = fp("div", &["foo", "bar"], &["data-x"], 0);
203        let b = fp("span", &["baz", "qux"], &["data-y"], 20);
204        let score = jaccard_weighted(&a, &b);
205        // tag=0, classes=0, attrs=0; depth_prox = 1/(1+20) ≈ 0.0476; weight=0.1
206        // expected ≈ 0.00476
207        assert!(
208            score < 0.05_f32,
209            "disjoint fingerprints should score near 0, got {score}"
210        );
211        assert!(score >= 0.0_f32, "score must be non-negative, got {score}");
212    }
213
214    #[test]
215    fn jaccard_partial() {
216        // Same tag, half classes in common, no attrs, same depth
217        let a = fp("div", &["a", "b"], &[], 2);
218        let b = fp("div", &["a", "c"], &[], 2);
219        let score = jaccard_weighted(&a, &b);
220        // tag=1*0.4=0.4, classes=jaccard({a,b},{a,c})=1/3≈0.333*0.35≈0.1167
221        // attrs=both empty→1.0*0.15=0.15, depth=1/(1+0)=1.0*0.1=0.1
222        // total ≈ 0.4 + 0.1167 + 0.15 + 0.1 = 0.7667
223        assert!(
224            score > 0.5_f32,
225            "partial-match fingerprint should score > 0.5, got {score}"
226        );
227        assert!(
228            score < 0.9_f32,
229            "partial-match fingerprint should score < 0.9, got {score}"
230        );
231    }
232
233    #[test]
234    fn similarity_config_default_threshold() {
235        assert!(
236            (SimilarityConfig::DEFAULT_THRESHOLD - 0.7_f32).abs() < f32::EPSILON,
237            "DEFAULT_THRESHOLD should be 0.7"
238        );
239        let cfg = SimilarityConfig::default();
240        assert!(
241            (cfg.threshold - SimilarityConfig::DEFAULT_THRESHOLD).abs() < f32::EPSILON,
242            "default threshold should equal DEFAULT_THRESHOLD"
243        );
244        assert_eq!(cfg.max_results, 10);
245    }
246
247    #[test]
248    fn fingerprint_serde_roundtrip() {
249        let original = ElementFingerprint {
250            tag: "section".to_string(),
251            classes: vec!["main".to_string(), "wrapper".to_string()],
252            attr_names: vec!["aria-label".to_string(), "data-section".to_string()],
253            depth: 5,
254        };
255        let json = serde_json::to_string(&original).expect("serialize");
256        let decoded: ElementFingerprint = serde_json::from_str(&json).expect("deserialize");
257        assert_eq!(original, decoded);
258    }
259
260    #[test]
261    fn fingerprint_serde_attr_names_key() {
262        // The JSON key must be "attrNames" (camelCase) to match the JS output.
263        let fp_val = ElementFingerprint {
264            tag: "a".to_string(),
265            classes: vec![],
266            attr_names: vec!["href".to_string()],
267            depth: 1,
268        };
269        let json = serde_json::to_string(&fp_val).expect("serialize");
270        assert!(
271            json.contains("\"attrNames\""),
272            "JSON key must be 'attrNames', got: {json}"
273        );
274    }
275}