stygian_browser/similarity.rs
1//! Adaptive element similarity search for [`crate::page::PageHandle`].
2//!
3//! Computes a structural fingerprint for a reference DOM node and finds
4//! candidates on the current page that exceed a configurable similarity
5//! threshold, even after class names or depth have shifted.
6
7use crate::page::NodeHandle;
8
9// ─── ElementFingerprint ───────────────────────────────────────────────────────
10
11/// A structural snapshot of a DOM element for similarity comparison.
12///
13/// Serialisable so callers can persist fingerprints across sessions.
14///
15/// # Example
16///
17/// ```
18/// use stygian_browser::similarity::ElementFingerprint;
19///
20/// let fp = ElementFingerprint {
21/// tag: "div".to_string(),
22/// classes: vec!["card".to_string(), "highlighted".to_string()],
23/// attr_names: vec!["data-id".to_string()],
24/// depth: 3,
25/// };
26/// let json = serde_json::to_string(&fp).unwrap();
27/// let back: ElementFingerprint = serde_json::from_str(&json).unwrap();
28/// assert_eq!(fp.tag, back.tag);
29/// ```
30#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
31pub struct ElementFingerprint {
32 /// Lower-case tag name (e.g. `"div"`, `"a"`).
33 pub tag: String,
34 /// Sorted CSS class list (excluding the empty string).
35 pub classes: Vec<String>,
36 /// Sorted attribute names present on the element, excluding `"class"` and
37 /// `"id"`.
38 #[serde(rename = "attrNames")]
39 pub attr_names: Vec<String>,
40 /// Distance from `<body>` in the DOM tree (direct children of `<body>`
41 /// have depth `0`).
42 pub depth: u32,
43}
44
45// ─── SimilarityConfig ─────────────────────────────────────────────────────────
46
47/// Tuning parameters for [`crate::page::PageHandle::find_similar`].
48///
49/// # Example
50///
51/// ```
52/// use stygian_browser::similarity::SimilarityConfig;
53///
54/// let cfg = SimilarityConfig { threshold: 0.5, max_results: 5 };
55/// assert!(cfg.threshold < SimilarityConfig::DEFAULT_THRESHOLD);
56/// ```
57#[derive(Debug, Clone)]
58pub struct SimilarityConfig {
59 /// Minimum score `[0.0, 1.0]` for a candidate to be included in results.
60 ///
61 /// Default: [`DEFAULT_THRESHOLD`](Self::DEFAULT_THRESHOLD).
62 pub threshold: f32,
63 /// Maximum number of results to return. `0` means unlimited.
64 ///
65 /// Default: `10`.
66 pub max_results: usize,
67}
68
69impl SimilarityConfig {
70 /// Default minimum similarity threshold (`0.7`).
71 pub const DEFAULT_THRESHOLD: f32 = 0.7;
72}
73
74impl Default for SimilarityConfig {
75 fn default() -> Self {
76 Self {
77 threshold: Self::DEFAULT_THRESHOLD,
78 max_results: 10,
79 }
80 }
81}
82
83// ─── SimilarMatch ─────────────────────────────────────────────────────────────
84
85/// A candidate element that exceeded the similarity threshold.
86pub struct SimilarMatch {
87 /// The matching node handle.
88 pub node: NodeHandle,
89 /// Similarity score in `[0.0, 1.0]`.
90 pub score: f32,
91}
92
93// ─── Scoring ──────────────────────────────────────────────────────────────────
94
95/// Compute the weighted Jaccard similarity between two element fingerprints.
96///
97/// Weights:
98/// | Component | Weight |
99/// |-----------|--------|
100/// | Tag name match | 0.40 |
101/// | Class list Jaccard | 0.35 |
102/// | Attribute name Jaccard | 0.15 |
103/// | Depth proximity | 0.10 |
104///
105/// Returns a score in `[0.0, 1.0]`.
106///
107/// # Example
108///
109/// ```
110/// use stygian_browser::similarity::{ElementFingerprint, jaccard_weighted};
111///
112/// let a = ElementFingerprint { tag: "div".into(), classes: vec!["foo".into()], attr_names: vec![], depth: 2 };
113/// let b = a.clone();
114/// assert!((jaccard_weighted(&a, &b) - 1.0).abs() < 1e-6);
115/// ```
116#[must_use]
117pub fn jaccard_weighted(reference: &ElementFingerprint, candidate: &ElementFingerprint) -> f32 {
118 let tag_score = if reference.tag == candidate.tag {
119 1.0_f32
120 } else {
121 0.0_f32
122 };
123
124 let class_score = jaccard_sets(&reference.classes, &candidate.classes);
125 let attr_score = jaccard_sets(&reference.attr_names, &candidate.attr_names);
126
127 // DOM tree depth is always small (< 1000 in practice); truncate to u16
128 // for a lossless f32 conversion (u16 fits exactly in f32's 23-bit mantissa).
129 let ref_depth = f32::from(u16::try_from(reference.depth).unwrap_or(u16::MAX));
130 let cand_depth = f32::from(u16::try_from(candidate.depth).unwrap_or(u16::MAX));
131 let depth_diff = (ref_depth - cand_depth).abs();
132 let depth_score = 1.0_f32 / (1.0_f32 + depth_diff);
133
134 depth_score.mul_add(
135 0.1_f32,
136 attr_score.mul_add(0.15_f32, tag_score.mul_add(0.4_f32, class_score * 0.35_f32)),
137 )
138}
139
140/// Compute Jaccard similarity (|intersection| / |union|) for two **sorted**
141/// string slices.
142///
143/// Returns `1.0` when both slices are empty (they are identical).
144fn jaccard_sets(a: &[String], b: &[String]) -> f32 {
145 if a.is_empty() && b.is_empty() {
146 return 1.0_f32;
147 }
148 let mut intersection: usize = 0;
149 let mut i = 0_usize;
150 let mut j = 0_usize;
151 while i < a.len() && j < b.len() {
152 let (Some(ai), Some(bj)) = (a.get(i), b.get(j)) else {
153 break;
154 };
155 match ai.cmp(bj) {
156 std::cmp::Ordering::Equal => {
157 intersection += 1;
158 i += 1;
159 j += 1;
160 }
161 std::cmp::Ordering::Less => i += 1,
162 std::cmp::Ordering::Greater => j += 1,
163 }
164 }
165 let union = a.len() + b.len() - intersection;
166 // union > 0 because at least one slice is non-empty (guarded above)
167 // Class/attribute counts are tiny (< 100); u16 fits losslessly in f32.
168 let i_f = f32::from(u16::try_from(intersection).unwrap_or(u16::MAX));
169 let u_f = f32::from(u16::try_from(union).unwrap_or(u16::MAX));
170 i_f / u_f
171}
172
173// ─── Tests ────────────────────────────────────────────────────────────────────
174
175#[cfg(test)]
176#[allow(clippy::expect_used)] // serde failures in deterministic tests are programmer errors
177mod tests {
178 use super::*;
179
180 fn fp(tag: &str, classes: &[&str], attrs: &[&str], depth: u32) -> ElementFingerprint {
181 ElementFingerprint {
182 tag: tag.to_string(),
183 classes: classes.iter().map(|s| (*s).to_string()).collect(),
184 attr_names: attrs.iter().map(|s| (*s).to_string()).collect(),
185 depth,
186 }
187 }
188
189 #[test]
190 fn jaccard_identical() {
191 let a = fp("div", &["card", "highlighted"], &["data-id"], 3);
192 let b = a.clone();
193 let score = jaccard_weighted(&a, &b);
194 assert!(
195 (score - 1.0_f32).abs() < 1e-5_f32,
196 "identical fingerprints should score 1.0, got {score}"
197 );
198 }
199
200 #[test]
201 fn jaccard_disjoint() {
202 let a = fp("div", &["foo", "bar"], &["data-x"], 0);
203 let b = fp("span", &["baz", "qux"], &["data-y"], 20);
204 let score = jaccard_weighted(&a, &b);
205 // tag=0, classes=0, attrs=0; depth_prox = 1/(1+20) ≈ 0.0476; weight=0.1
206 // expected ≈ 0.00476
207 assert!(
208 score < 0.05_f32,
209 "disjoint fingerprints should score near 0, got {score}"
210 );
211 assert!(score >= 0.0_f32, "score must be non-negative, got {score}");
212 }
213
214 #[test]
215 fn jaccard_partial() {
216 // Same tag, half classes in common, no attrs, same depth
217 let a = fp("div", &["a", "b"], &[], 2);
218 let b = fp("div", &["a", "c"], &[], 2);
219 let score = jaccard_weighted(&a, &b);
220 // tag=1*0.4=0.4, classes=jaccard({a,b},{a,c})=1/3≈0.333*0.35≈0.1167
221 // attrs=both empty→1.0*0.15=0.15, depth=1/(1+0)=1.0*0.1=0.1
222 // total ≈ 0.4 + 0.1167 + 0.15 + 0.1 = 0.7667
223 assert!(
224 score > 0.5_f32,
225 "partial-match fingerprint should score > 0.5, got {score}"
226 );
227 assert!(
228 score < 0.9_f32,
229 "partial-match fingerprint should score < 0.9, got {score}"
230 );
231 }
232
233 #[test]
234 fn similarity_config_default_threshold() {
235 assert!(
236 (SimilarityConfig::DEFAULT_THRESHOLD - 0.7_f32).abs() < f32::EPSILON,
237 "DEFAULT_THRESHOLD should be 0.7"
238 );
239 let cfg = SimilarityConfig::default();
240 assert!(
241 (cfg.threshold - SimilarityConfig::DEFAULT_THRESHOLD).abs() < f32::EPSILON,
242 "default threshold should equal DEFAULT_THRESHOLD"
243 );
244 assert_eq!(cfg.max_results, 10);
245 }
246
247 #[test]
248 fn fingerprint_serde_roundtrip() {
249 let original = ElementFingerprint {
250 tag: "section".to_string(),
251 classes: vec!["main".to_string(), "wrapper".to_string()],
252 attr_names: vec!["aria-label".to_string(), "data-section".to_string()],
253 depth: 5,
254 };
255 let json = serde_json::to_string(&original).expect("serialize");
256 let decoded: ElementFingerprint = serde_json::from_str(&json).expect("deserialize");
257 assert_eq!(original, decoded);
258 }
259
260 #[test]
261 fn fingerprint_serde_attr_names_key() {
262 // The JSON key must be "attrNames" (camelCase) to match the JS output.
263 let fp_val = ElementFingerprint {
264 tag: "a".to_string(),
265 classes: vec![],
266 attr_names: vec!["href".to_string()],
267 depth: 1,
268 };
269 let json = serde_json::to_string(&fp_val).expect("serialize");
270 assert!(
271 json.contains("\"attrNames\""),
272 "JSON key must be 'attrNames', got: {json}"
273 );
274 }
275}