stygian_browser/similarity.rs
1//! Adaptive element similarity search for [`crate::page::PageHandle`].
2//!
3//! Computes a structural fingerprint for a reference DOM node and finds
4//! candidates on the current page that exceed a configurable similarity
5//! threshold, even after class names or depth have shifted.
6
7use crate::page::NodeHandle;
8
9// ─── ElementFingerprint ───────────────────────────────────────────────────────
10
11/// A structural snapshot of a DOM element for similarity comparison.
12///
13/// Serialisable so callers can persist fingerprints across sessions.
14///
15/// # Example
16///
17/// ```
18/// use stygian_browser::similarity::ElementFingerprint;
19///
20/// let fp = ElementFingerprint {
21/// tag: "div".to_string(),
22/// classes: vec!["card".to_string(), "highlighted".to_string()],
23/// attr_names: vec!["data-id".to_string()],
24/// depth: 3,
25/// };
26/// let json = serde_json::to_string(&fp).unwrap();
27/// let back: ElementFingerprint = serde_json::from_str(&json).unwrap();
28/// assert_eq!(fp.tag, back.tag);
29/// ```
30#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
31pub struct ElementFingerprint {
32 /// Lower-case tag name (e.g. `"div"`, `"a"`).
33 pub tag: String,
34 /// Sorted CSS class list (excluding the empty string).
35 pub classes: Vec<String>,
36 /// Sorted attribute names present on the element, excluding `"class"` and
37 /// `"id"`.
38 #[serde(rename = "attrNames")]
39 pub attr_names: Vec<String>,
40 /// Distance from `<body>` in the DOM tree (direct children of `<body>`
41 /// have depth `0`).
42 pub depth: u32,
43}
44
45// ─── SimilarityConfig ─────────────────────────────────────────────────────────
46
47/// Tuning parameters for [`crate::page::PageHandle::find_similar`].
48///
49/// # Example
50///
51/// ```
52/// use stygian_browser::similarity::SimilarityConfig;
53///
54/// let cfg = SimilarityConfig { threshold: 0.5, max_results: 5 };
55/// assert!(cfg.threshold < SimilarityConfig::DEFAULT_THRESHOLD);
56/// ```
57#[derive(Debug, Clone)]
58pub struct SimilarityConfig {
59 /// Minimum score `[0.0, 1.0]` for a candidate to be included in results.
60 ///
61 /// Default: [`DEFAULT_THRESHOLD`](Self::DEFAULT_THRESHOLD).
62 pub threshold: f32,
63 /// Maximum number of results to return. `0` means unlimited.
64 ///
65 /// Default: `10`.
66 pub max_results: usize,
67}
68
69impl SimilarityConfig {
70 /// Default minimum similarity threshold (`0.7`).
71 pub const DEFAULT_THRESHOLD: f32 = 0.7;
72}
73
74impl Default for SimilarityConfig {
75 fn default() -> Self {
76 Self {
77 threshold: Self::DEFAULT_THRESHOLD,
78 max_results: 10,
79 }
80 }
81}
82
83// ─── SimilarMatch ─────────────────────────────────────────────────────────────
84
85/// A candidate element that exceeded the similarity threshold.
86pub struct SimilarMatch {
87 /// The matching node handle.
88 pub node: NodeHandle,
89 /// Similarity score in `[0.0, 1.0]`.
90 pub score: f32,
91}
92
93// ─── Scoring ──────────────────────────────────────────────────────────────────
94
95/// Compute the weighted Jaccard similarity between two element fingerprints.
96///
97/// Weights:
98/// | Component | Weight |
99/// |-----------|--------|
100/// | Tag name match | 0.40 |
101/// | Class list Jaccard | 0.35 |
102/// | Attribute name Jaccard | 0.15 |
103/// | Depth proximity | 0.10 |
104///
105/// Returns a score in `[0.0, 1.0]`.
106///
107/// # Example
108///
109/// ```
110/// use stygian_browser::similarity::{ElementFingerprint, jaccard_weighted};
111///
112/// let a = ElementFingerprint { tag: "div".into(), classes: vec!["foo".into()], attr_names: vec![], depth: 2 };
113/// let b = a.clone();
114/// assert!((jaccard_weighted(&a, &b) - 1.0).abs() < 1e-6);
115/// ```
116pub fn jaccard_weighted(reference: &ElementFingerprint, candidate: &ElementFingerprint) -> f32 {
117 let tag_score = if reference.tag == candidate.tag {
118 1.0_f32
119 } else {
120 0.0_f32
121 };
122
123 let class_score = jaccard_sets(&reference.classes, &candidate.classes);
124 let attr_score = jaccard_sets(&reference.attr_names, &candidate.attr_names);
125
126 // DOM tree depth is always small (< 1000 in practice); truncate to u16
127 // for a lossless f32 conversion (u16 fits exactly in f32's 23-bit mantissa).
128 let ref_depth = f32::from(u16::try_from(reference.depth).unwrap_or(u16::MAX));
129 let cand_depth = f32::from(u16::try_from(candidate.depth).unwrap_or(u16::MAX));
130 let depth_diff = (ref_depth - cand_depth).abs();
131 let depth_score = 1.0_f32 / (1.0_f32 + depth_diff);
132
133 depth_score.mul_add(
134 0.1_f32,
135 attr_score.mul_add(0.15_f32, tag_score.mul_add(0.4_f32, class_score * 0.35_f32)),
136 )
137}
138
139/// Compute Jaccard similarity (|intersection| / |union|) for two **sorted**
140/// string slices.
141///
142/// Returns `1.0` when both slices are empty (they are identical).
143fn jaccard_sets(a: &[String], b: &[String]) -> f32 {
144 if a.is_empty() && b.is_empty() {
145 return 1.0_f32;
146 }
147 let mut intersection: usize = 0;
148 let mut i = 0_usize;
149 let mut j = 0_usize;
150 while i < a.len() && j < b.len() {
151 let (Some(ai), Some(bj)) = (a.get(i), b.get(j)) else {
152 break;
153 };
154 match ai.cmp(bj) {
155 std::cmp::Ordering::Equal => {
156 intersection += 1;
157 i += 1;
158 j += 1;
159 }
160 std::cmp::Ordering::Less => i += 1,
161 std::cmp::Ordering::Greater => j += 1,
162 }
163 }
164 let union = a.len() + b.len() - intersection;
165 // union > 0 because at least one slice is non-empty (guarded above)
166 // Class/attribute counts are tiny (< 100); u16 fits losslessly in f32.
167 let i_f = f32::from(u16::try_from(intersection).unwrap_or(u16::MAX));
168 let u_f = f32::from(u16::try_from(union).unwrap_or(u16::MAX));
169 i_f / u_f
170}
171
172// ─── Tests ────────────────────────────────────────────────────────────────────
173
174#[cfg(test)]
175#[allow(clippy::expect_used)] // serde failures in deterministic tests are programmer errors
176mod tests {
177 use super::*;
178
179 fn fp(tag: &str, classes: &[&str], attrs: &[&str], depth: u32) -> ElementFingerprint {
180 ElementFingerprint {
181 tag: tag.to_string(),
182 classes: classes.iter().map(|s| (*s).to_string()).collect(),
183 attr_names: attrs.iter().map(|s| (*s).to_string()).collect(),
184 depth,
185 }
186 }
187
188 #[test]
189 fn jaccard_identical() {
190 let a = fp("div", &["card", "highlighted"], &["data-id"], 3);
191 let b = a.clone();
192 let score = jaccard_weighted(&a, &b);
193 assert!(
194 (score - 1.0_f32).abs() < 1e-5_f32,
195 "identical fingerprints should score 1.0, got {score}"
196 );
197 }
198
199 #[test]
200 fn jaccard_disjoint() {
201 let a = fp("div", &["foo", "bar"], &["data-x"], 0);
202 let b = fp("span", &["baz", "qux"], &["data-y"], 20);
203 let score = jaccard_weighted(&a, &b);
204 // tag=0, classes=0, attrs=0; depth_prox = 1/(1+20) ≈ 0.0476; weight=0.1
205 // expected ≈ 0.00476
206 assert!(
207 score < 0.05_f32,
208 "disjoint fingerprints should score near 0, got {score}"
209 );
210 assert!(score >= 0.0_f32, "score must be non-negative, got {score}");
211 }
212
213 #[test]
214 fn jaccard_partial() {
215 // Same tag, half classes in common, no attrs, same depth
216 let a = fp("div", &["a", "b"], &[], 2);
217 let b = fp("div", &["a", "c"], &[], 2);
218 let score = jaccard_weighted(&a, &b);
219 // tag=1*0.4=0.4, classes=jaccard({a,b},{a,c})=1/3≈0.333*0.35≈0.1167
220 // attrs=both empty→1.0*0.15=0.15, depth=1/(1+0)=1.0*0.1=0.1
221 // total ≈ 0.4 + 0.1167 + 0.15 + 0.1 = 0.7667
222 assert!(
223 score > 0.5_f32,
224 "partial-match fingerprint should score > 0.5, got {score}"
225 );
226 assert!(
227 score < 0.9_f32,
228 "partial-match fingerprint should score < 0.9, got {score}"
229 );
230 }
231
232 #[test]
233 fn similarity_config_default_threshold() {
234 assert!(
235 (SimilarityConfig::DEFAULT_THRESHOLD - 0.7_f32).abs() < f32::EPSILON,
236 "DEFAULT_THRESHOLD should be 0.7"
237 );
238 let cfg = SimilarityConfig::default();
239 assert!(
240 (cfg.threshold - SimilarityConfig::DEFAULT_THRESHOLD).abs() < f32::EPSILON,
241 "default threshold should equal DEFAULT_THRESHOLD"
242 );
243 assert_eq!(cfg.max_results, 10);
244 }
245
246 #[test]
247 fn fingerprint_serde_roundtrip() {
248 let original = ElementFingerprint {
249 tag: "section".to_string(),
250 classes: vec!["main".to_string(), "wrapper".to_string()],
251 attr_names: vec!["aria-label".to_string(), "data-section".to_string()],
252 depth: 5,
253 };
254 let json = serde_json::to_string(&original).expect("serialize");
255 let decoded: ElementFingerprint = serde_json::from_str(&json).expect("deserialize");
256 assert_eq!(original, decoded);
257 }
258
259 #[test]
260 fn fingerprint_serde_attr_names_key() {
261 // The JSON key must be "attrNames" (camelCase) to match the JS output.
262 let fp_val = ElementFingerprint {
263 tag: "a".to_string(),
264 classes: vec![],
265 attr_names: vec!["href".to_string()],
266 depth: 1,
267 };
268 let json = serde_json::to_string(&fp_val).expect("serialize");
269 assert!(
270 json.contains("\"attrNames\""),
271 "JSON key must be 'attrNames', got: {json}"
272 );
273 }
274}