Skip to main content

stygian_browser/
similarity.rs

1//! Adaptive element similarity search for [`crate::page::PageHandle`].
2//!
3//! Computes a structural fingerprint for a reference DOM node and finds
4//! candidates on the current page that exceed a configurable similarity
5//! threshold, even after class names or depth have shifted.
6
7use crate::page::NodeHandle;
8
9// ─── ElementFingerprint ───────────────────────────────────────────────────────
10
11/// A structural snapshot of a DOM element for similarity comparison.
12///
13/// Serialisable so callers can persist fingerprints across sessions.
14///
15/// # Example
16///
17/// ```
18/// use stygian_browser::similarity::ElementFingerprint;
19///
20/// let fp = ElementFingerprint {
21///     tag: "div".to_string(),
22///     classes: vec!["card".to_string(), "highlighted".to_string()],
23///     attr_names: vec!["data-id".to_string()],
24///     depth: 3,
25/// };
26/// let json = serde_json::to_string(&fp).unwrap();
27/// let back: ElementFingerprint = serde_json::from_str(&json).unwrap();
28/// assert_eq!(fp.tag, back.tag);
29/// ```
30#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
31pub struct ElementFingerprint {
32    /// Lower-case tag name (e.g. `"div"`, `"a"`).
33    pub tag: String,
34    /// Sorted CSS class list (excluding the empty string).
35    pub classes: Vec<String>,
36    /// Sorted attribute names present on the element, excluding `"class"` and
37    /// `"id"`.
38    #[serde(rename = "attrNames")]
39    pub attr_names: Vec<String>,
40    /// Distance from `<body>` in the DOM tree (direct children of `<body>`
41    /// have depth `0`).
42    pub depth: u32,
43}
44
45// ─── SimilarityConfig ─────────────────────────────────────────────────────────
46
47/// Tuning parameters for [`crate::page::PageHandle::find_similar`].
48///
49/// # Example
50///
51/// ```
52/// use stygian_browser::similarity::SimilarityConfig;
53///
54/// let cfg = SimilarityConfig { threshold: 0.5, max_results: 5 };
55/// assert!(cfg.threshold < SimilarityConfig::DEFAULT_THRESHOLD);
56/// ```
57#[derive(Debug, Clone)]
58pub struct SimilarityConfig {
59    /// Minimum score `[0.0, 1.0]` for a candidate to be included in results.
60    ///
61    /// Default: [`DEFAULT_THRESHOLD`](Self::DEFAULT_THRESHOLD).
62    pub threshold: f32,
63    /// Maximum number of results to return.  `0` means unlimited.
64    ///
65    /// Default: `10`.
66    pub max_results: usize,
67}
68
69impl SimilarityConfig {
70    /// Default minimum similarity threshold (`0.7`).
71    pub const DEFAULT_THRESHOLD: f32 = 0.7;
72}
73
74impl Default for SimilarityConfig {
75    fn default() -> Self {
76        Self {
77            threshold: Self::DEFAULT_THRESHOLD,
78            max_results: 10,
79        }
80    }
81}
82
83// ─── SimilarMatch ─────────────────────────────────────────────────────────────
84
85/// A candidate element that exceeded the similarity threshold.
86pub struct SimilarMatch {
87    /// The matching node handle.
88    pub node: NodeHandle,
89    /// Similarity score in `[0.0, 1.0]`.
90    pub score: f32,
91}
92
93// ─── Scoring ──────────────────────────────────────────────────────────────────
94
95/// Compute the weighted Jaccard similarity between two element fingerprints.
96///
97/// Weights:
98/// | Component | Weight |
99/// |-----------|--------|
100/// | Tag name match | 0.40 |
101/// | Class list Jaccard | 0.35 |
102/// | Attribute name Jaccard | 0.15 |
103/// | Depth proximity | 0.10 |
104///
105/// Returns a score in `[0.0, 1.0]`.
106///
107/// # Example
108///
109/// ```
110/// use stygian_browser::similarity::{ElementFingerprint, jaccard_weighted};
111///
112/// let a = ElementFingerprint { tag: "div".into(), classes: vec!["foo".into()], attr_names: vec![], depth: 2 };
113/// let b = a.clone();
114/// assert!((jaccard_weighted(&a, &b) - 1.0).abs() < 1e-6);
115/// ```
116pub fn jaccard_weighted(reference: &ElementFingerprint, candidate: &ElementFingerprint) -> f32 {
117    let tag_score = if reference.tag == candidate.tag {
118        1.0_f32
119    } else {
120        0.0_f32
121    };
122
123    let class_score = jaccard_sets(&reference.classes, &candidate.classes);
124    let attr_score = jaccard_sets(&reference.attr_names, &candidate.attr_names);
125
126    // DOM tree depth is always small (< 1000 in practice); truncate to u16
127    // for a lossless f32 conversion (u16 fits exactly in f32's 23-bit mantissa).
128    let ref_depth = f32::from(u16::try_from(reference.depth).unwrap_or(u16::MAX));
129    let cand_depth = f32::from(u16::try_from(candidate.depth).unwrap_or(u16::MAX));
130    let depth_diff = (ref_depth - cand_depth).abs();
131    let depth_score = 1.0_f32 / (1.0_f32 + depth_diff);
132
133    depth_score.mul_add(
134        0.1_f32,
135        attr_score.mul_add(0.15_f32, tag_score.mul_add(0.4_f32, class_score * 0.35_f32)),
136    )
137}
138
139/// Compute Jaccard similarity (|intersection| / |union|) for two **sorted**
140/// string slices.
141///
142/// Returns `1.0` when both slices are empty (they are identical).
143fn jaccard_sets(a: &[String], b: &[String]) -> f32 {
144    if a.is_empty() && b.is_empty() {
145        return 1.0_f32;
146    }
147    let mut intersection: usize = 0;
148    let mut i = 0_usize;
149    let mut j = 0_usize;
150    while i < a.len() && j < b.len() {
151        let (Some(ai), Some(bj)) = (a.get(i), b.get(j)) else {
152            break;
153        };
154        match ai.cmp(bj) {
155            std::cmp::Ordering::Equal => {
156                intersection += 1;
157                i += 1;
158                j += 1;
159            }
160            std::cmp::Ordering::Less => i += 1,
161            std::cmp::Ordering::Greater => j += 1,
162        }
163    }
164    let union = a.len() + b.len() - intersection;
165    // union > 0 because at least one slice is non-empty (guarded above)
166    // Class/attribute counts are tiny (< 100); u16 fits losslessly in f32.
167    let i_f = f32::from(u16::try_from(intersection).unwrap_or(u16::MAX));
168    let u_f = f32::from(u16::try_from(union).unwrap_or(u16::MAX));
169    i_f / u_f
170}
171
172// ─── Tests ────────────────────────────────────────────────────────────────────
173
174#[cfg(test)]
175#[allow(clippy::expect_used)] // serde failures in deterministic tests are programmer errors
176mod tests {
177    use super::*;
178
179    fn fp(tag: &str, classes: &[&str], attrs: &[&str], depth: u32) -> ElementFingerprint {
180        ElementFingerprint {
181            tag: tag.to_string(),
182            classes: classes.iter().map(|s| (*s).to_string()).collect(),
183            attr_names: attrs.iter().map(|s| (*s).to_string()).collect(),
184            depth,
185        }
186    }
187
188    #[test]
189    fn jaccard_identical() {
190        let a = fp("div", &["card", "highlighted"], &["data-id"], 3);
191        let b = a.clone();
192        let score = jaccard_weighted(&a, &b);
193        assert!(
194            (score - 1.0_f32).abs() < 1e-5_f32,
195            "identical fingerprints should score 1.0, got {score}"
196        );
197    }
198
199    #[test]
200    fn jaccard_disjoint() {
201        let a = fp("div", &["foo", "bar"], &["data-x"], 0);
202        let b = fp("span", &["baz", "qux"], &["data-y"], 20);
203        let score = jaccard_weighted(&a, &b);
204        // tag=0, classes=0, attrs=0; depth_prox = 1/(1+20) ≈ 0.0476; weight=0.1
205        // expected ≈ 0.00476
206        assert!(
207            score < 0.05_f32,
208            "disjoint fingerprints should score near 0, got {score}"
209        );
210        assert!(score >= 0.0_f32, "score must be non-negative, got {score}");
211    }
212
213    #[test]
214    fn jaccard_partial() {
215        // Same tag, half classes in common, no attrs, same depth
216        let a = fp("div", &["a", "b"], &[], 2);
217        let b = fp("div", &["a", "c"], &[], 2);
218        let score = jaccard_weighted(&a, &b);
219        // tag=1*0.4=0.4, classes=jaccard({a,b},{a,c})=1/3≈0.333*0.35≈0.1167
220        // attrs=both empty→1.0*0.15=0.15, depth=1/(1+0)=1.0*0.1=0.1
221        // total ≈ 0.4 + 0.1167 + 0.15 + 0.1 = 0.7667
222        assert!(
223            score > 0.5_f32,
224            "partial-match fingerprint should score > 0.5, got {score}"
225        );
226        assert!(
227            score < 0.9_f32,
228            "partial-match fingerprint should score < 0.9, got {score}"
229        );
230    }
231
232    #[test]
233    fn similarity_config_default_threshold() {
234        assert!(
235            (SimilarityConfig::DEFAULT_THRESHOLD - 0.7_f32).abs() < f32::EPSILON,
236            "DEFAULT_THRESHOLD should be 0.7"
237        );
238        let cfg = SimilarityConfig::default();
239        assert!(
240            (cfg.threshold - SimilarityConfig::DEFAULT_THRESHOLD).abs() < f32::EPSILON,
241            "default threshold should equal DEFAULT_THRESHOLD"
242        );
243        assert_eq!(cfg.max_results, 10);
244    }
245
246    #[test]
247    fn fingerprint_serde_roundtrip() {
248        let original = ElementFingerprint {
249            tag: "section".to_string(),
250            classes: vec!["main".to_string(), "wrapper".to_string()],
251            attr_names: vec!["aria-label".to_string(), "data-section".to_string()],
252            depth: 5,
253        };
254        let json = serde_json::to_string(&original).expect("serialize");
255        let decoded: ElementFingerprint = serde_json::from_str(&json).expect("deserialize");
256        assert_eq!(original, decoded);
257    }
258
259    #[test]
260    fn fingerprint_serde_attr_names_key() {
261        // The JSON key must be "attrNames" (camelCase) to match the JS output.
262        let fp_val = ElementFingerprint {
263            tag: "a".to_string(),
264            classes: vec![],
265            attr_names: vec!["href".to_string()],
266            depth: 1,
267        };
268        let json = serde_json::to_string(&fp_val).expect("serialize");
269        assert!(
270            json.contains("\"attrNames\""),
271            "JSON key must be 'attrNames', got: {json}"
272        );
273    }
274}