Skip to main content

stygian_graph/adapters/
sitemap.rs

1//! Sitemap / sitemap-index [`ScrapingService`](crate::ports::ScrapingService) adapter
2//!
3//! Parses XML sitemaps (`<urlset>`) and sitemap index files (`<sitemapindex>`),
4//! emitting discovered URLs with metadata for downstream pipeline nodes.
5//!
6//! Supports:
7//! - Standard sitemaps (`<urlset>` with `<url>` entries)
8//! - Sitemap index files (`<sitemapindex>` with nested `<sitemap>` refs)
9//! - Gzipped sitemaps (`.xml.gz`) via `flate2`
10//! - Filtering by `lastmod` date range or `priority` threshold
11//!
12//! # Example
13//!
14//! ```no_run
15//! use stygian_graph::adapters::sitemap::SitemapAdapter;
16//! use stygian_graph::ports::{ScrapingService, ServiceInput};
17//! use serde_json::json;
18//!
19//! # tokio::runtime::Runtime::new().unwrap().block_on(async {
20//! let adapter = SitemapAdapter::new(reqwest::Client::new(), 5);
21//! let input = ServiceInput {
22//!     url: "https://example.com/sitemap.xml".into(),
23//!     params: json!({}),
24//! };
25//! let output = adapter.execute(input).await.unwrap();
26//! println!("{}", output.data); // JSON array of discovered URLs
27//! # });
28//! ```
29
30use crate::domain::error::{Result, ServiceError, StygianError};
31use crate::ports::{ScrapingService, ServiceInput, ServiceOutput};
32use async_trait::async_trait;
33use flate2::read::GzDecoder;
34use quick_xml::Reader;
35use quick_xml::events::Event;
36use serde::{Deserialize, Serialize};
37use serde_json::json;
38use std::io::Read;
39
40// ─── Domain types ─────────────────────────────────────────────────────────────
41
/// A single URL entry extracted from a sitemap.
///
/// Text fields hold the element content as found in the document, with
/// surrounding whitespace trimmed; they are not validated or normalised.
///
/// # Example
///
/// ```
/// use stygian_graph::adapters::sitemap::SitemapEntry;
///
/// let entry = SitemapEntry {
///     loc: "https://example.com/page".into(),
///     lastmod: Some("2026-03-01".into()),
///     changefreq: Some("weekly".into()),
///     priority: Some(0.8),
/// };
/// ```
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct SitemapEntry {
    /// Absolute URL, taken from the `<loc>` element.
    pub loc: String,
    /// Last-modified date string (ISO 8601), or `None` when `<lastmod>` is absent.
    pub lastmod: Option<String>,
    /// Change frequency hint, or `None` when `<changefreq>` is absent.
    pub changefreq: Option<String>,
    /// Priority (0.0–1.0); `None` when `<priority>` is absent or is not a
    /// parseable number. The range itself is not enforced.
    pub priority: Option<f64>,
}
67
68// ─── Adapter ──────────────────────────────────────────────────────────────────
69
70/// Sitemap / sitemap-index source adapter.
71///
72/// Fetches and parses XML sitemaps, recursively resolving sitemap index files
73/// up to a configurable depth limit.
74///
75/// # Example
76///
77/// ```no_run
78/// use stygian_graph::adapters::sitemap::SitemapAdapter;
79///
80/// let adapter = SitemapAdapter::new(reqwest::Client::new(), 3);
81/// ```
82pub struct SitemapAdapter {
83    client: reqwest::Client,
84    max_depth: usize,
85}
86
87impl SitemapAdapter {
88    /// Create a new sitemap adapter.
89    ///
90    /// `max_depth` controls how many levels of sitemap-index nesting to follow.
91    ///
92    /// # Example
93    ///
94    /// ```
95    /// use stygian_graph::adapters::sitemap::SitemapAdapter;
96    ///
97    /// let adapter = SitemapAdapter::new(reqwest::Client::new(), 5);
98    /// ```
99    #[must_use]
100    pub const fn new(client: reqwest::Client, max_depth: usize) -> Self {
101        Self { client, max_depth }
102    }
103
104    /// Fetch raw bytes from a URL, transparently decompressing `.xml.gz`.
105    ///
106    /// # Errors
107    ///
108    /// Returns [`StygianError::Service`] on HTTP or decompression failure.
109    async fn fetch_bytes(&self, url: &str) -> Result<String> {
110        let resp = self.client.get(url).send().await.map_err(|e| {
111            StygianError::Service(ServiceError::Unavailable(format!(
112                "sitemap fetch failed: {e}"
113            )))
114        })?;
115
116        if !resp.status().is_success() {
117            return Err(StygianError::Service(ServiceError::InvalidResponse(
118                format!("sitemap returned HTTP {}", resp.status()),
119            )));
120        }
121
122        let bytes = resp.bytes().await.map_err(|e| {
123            StygianError::Service(ServiceError::Unavailable(format!(
124                "sitemap body read failed: {e}"
125            )))
126        })?;
127
128        // Attempt gzip decompression if URL ends in .gz or content looks gzipped
129        if url.to_ascii_lowercase().ends_with(".gz") || bytes.starts_with(&[0x1f, 0x8b]) {
130            let mut decoder = GzDecoder::new(&bytes[..]);
131            let mut xml = String::new();
132            decoder.read_to_string(&mut xml).map_err(|e| {
133                StygianError::Service(ServiceError::InvalidResponse(format!(
134                    "gzip decompression failed: {e}"
135                )))
136            })?;
137            Ok(xml)
138        } else {
139            String::from_utf8(bytes.to_vec()).map_err(|e| {
140                StygianError::Service(ServiceError::InvalidResponse(format!(
141                    "sitemap not valid UTF-8: {e}"
142                )))
143            })
144        }
145    }
146
147    /// Recursively resolve a sitemap URL, returning all discovered entries.
148    ///
149    /// # Errors
150    ///
151    /// Returns [`StygianError::Service`] on fetch, parse, or depth-limit errors.
152    async fn resolve(&self, url: &str, depth: usize) -> Result<Vec<SitemapEntry>> {
153        if depth > self.max_depth {
154            return Err(StygianError::Service(ServiceError::InvalidResponse(
155                format!(
156                    "sitemap index nesting exceeded max depth ({depth} > {})",
157                    self.max_depth
158                ),
159            )));
160        }
161
162        let xml = self.fetch_bytes(url).await?;
163        let root_kind = detect_root_element(&xml)?;
164
165        match root_kind {
166            RootElement::UrlSet => parse_urlset(&xml),
167            RootElement::SitemapIndex => {
168                let nested_urls = parse_sitemapindex(&xml)?;
169                let mut all = Vec::new();
170                for nested_url in &nested_urls {
171                    let entries = Box::pin(self.resolve(nested_url, depth + 1)).await?;
172                    all.extend(entries);
173                }
174                Ok(all)
175            }
176        }
177    }
178}
179
180#[async_trait]
181impl ScrapingService for SitemapAdapter {
182    /// Fetch and parse a sitemap, returning discovered URLs as JSON.
183    ///
184    /// # Params (optional)
185    ///
186    /// * `min_priority` — f64, filter entries with priority >= this value.
187    /// * `lastmod_after` — string, include only entries with lastmod >= this date.
188    /// * `lastmod_before` — string, include only entries with lastmod <= this date.
189    ///
190    /// # Example
191    ///
192    /// ```no_run
193    /// # use stygian_graph::adapters::sitemap::SitemapAdapter;
194    /// # use stygian_graph::ports::{ScrapingService, ServiceInput};
195    /// # use serde_json::json;
196    /// # tokio::runtime::Runtime::new().unwrap().block_on(async {
197    /// let adapter = SitemapAdapter::new(reqwest::Client::new(), 5);
198    /// let input = ServiceInput {
199    ///     url: "https://example.com/sitemap.xml".into(),
200    ///     params: json!({ "min_priority": 0.5 }),
201    /// };
202    /// let out = adapter.execute(input).await.unwrap();
203    /// # });
204    /// ```
205    async fn execute(&self, input: ServiceInput) -> Result<ServiceOutput> {
206        let mut entries = self.resolve(&input.url, 0).await?;
207
208        // Apply optional filters
209        if let Some(min_pri) = input
210            .params
211            .get("min_priority")
212            .and_then(serde_json::Value::as_f64)
213        {
214            entries.retain(|e| e.priority.unwrap_or(0.0) >= min_pri);
215        }
216        if let Some(after) = input.params.get("lastmod_after").and_then(|v| v.as_str()) {
217            entries.retain(|e| e.lastmod.as_deref().is_some_and(|lm| lm >= after));
218        }
219        if let Some(before) = input.params.get("lastmod_before").and_then(|v| v.as_str()) {
220            entries.retain(|e| e.lastmod.as_deref().is_some_and(|lm| lm <= before));
221        }
222
223        let count = entries.len();
224        let data = serde_json::to_string(&entries).map_err(|e| {
225            StygianError::Service(ServiceError::InvalidResponse(format!(
226                "sitemap serialization failed: {e}"
227            )))
228        })?;
229
230        Ok(ServiceOutput {
231            data,
232            metadata: json!({
233                "source": "sitemap",
234                "url_count": count,
235                "source_url": input.url,
236            }),
237        })
238    }
239
240    fn name(&self) -> &'static str {
241        "sitemap"
242    }
243}
244
245// ─── XML parsing helpers ──────────────────────────────────────────────────────
246
/// Kind of sitemap document, decided by the local name of its root element.
#[derive(Debug, PartialEq)]
enum RootElement {
    /// Root is `<urlset>`: the document lists page URLs directly.
    UrlSet,
    /// Root is `<sitemapindex>`: the document lists further sitemaps.
    SitemapIndex,
}
252
253/// Detect whether the XML document is a `<urlset>` or `<sitemapindex>`.
254fn detect_root_element(xml: &str) -> Result<RootElement> {
255    let mut reader = Reader::from_str(xml);
256    let mut buf = Vec::new();
257
258    loop {
259        match reader.read_event_into(&mut buf) {
260            Ok(Event::Start(ref e) | Event::Empty(ref e)) => {
261                let local = e.local_name();
262                let name = std::str::from_utf8(local.as_ref()).unwrap_or("");
263                return match name {
264                    "urlset" => Ok(RootElement::UrlSet),
265                    "sitemapindex" => Ok(RootElement::SitemapIndex),
266                    _ => Err(StygianError::Service(ServiceError::InvalidResponse(
267                        format!("unexpected XML root element: <{name}>"),
268                    ))),
269                };
270            }
271            Ok(Event::Eof) => {
272                return Err(StygianError::Service(ServiceError::InvalidResponse(
273                    "empty or invalid XML document".into(),
274                )));
275            }
276            Err(e) => {
277                return Err(StygianError::Service(ServiceError::InvalidResponse(
278                    format!("XML parse error: {e}"),
279                )));
280            }
281            _ => {} // skip processing instructions, comments, decl
282        }
283        buf.clear();
284    }
285}
286
287/// Parse a `<urlset>` document into a list of [`SitemapEntry`].
288fn parse_urlset(xml: &str) -> Result<Vec<SitemapEntry>> {
289    let mut reader = Reader::from_str(xml);
290    let mut buf = Vec::new();
291    let mut entries = Vec::new();
292
293    // Current entry being built
294    let mut current: Option<SitemapEntryBuilder> = None;
295    let mut current_tag: Option<String> = None;
296
297    loop {
298        match reader.read_event_into(&mut buf) {
299            Ok(Event::Start(ref e)) => {
300                let name = local_name(e);
301                match name.as_str() {
302                    "url" => {
303                        current = Some(SitemapEntryBuilder::default());
304                    }
305                    "loc" | "lastmod" | "changefreq" | "priority" => {
306                        current_tag = Some(name);
307                    }
308                    _ => {}
309                }
310            }
311            Ok(Event::Text(ref t)) => {
312                if let (Some(builder), Some(tag)) = (&mut current, &current_tag) {
313                    let text = t.xml10_content().unwrap_or_default().trim().to_string();
314                    if !text.is_empty() {
315                        match tag.as_str() {
316                            "loc" => builder.loc = Some(text),
317                            "lastmod" => builder.lastmod = Some(text),
318                            "changefreq" => builder.changefreq = Some(text),
319                            "priority" => builder.priority = text.parse().ok(),
320                            _ => {}
321                        }
322                    }
323                }
324            }
325            Ok(Event::End(ref e)) => {
326                let name = local_name_end(e);
327                if name == "url"
328                    && let Some(builder) = current.take()
329                    && let Some(entry) = builder.build()
330                {
331                    entries.push(entry);
332                }
333                if current_tag.as_deref() == Some(&name) {
334                    current_tag = None;
335                }
336            }
337            Ok(Event::Eof) => break,
338            Err(e) => {
339                return Err(StygianError::Service(ServiceError::InvalidResponse(
340                    format!("sitemap XML parse error: {e}"),
341                )));
342            }
343            _ => {}
344        }
345        buf.clear();
346    }
347
348    Ok(entries)
349}
350
351/// Parse a `<sitemapindex>` document, returning the `<loc>` URLs of nested sitemaps.
352fn parse_sitemapindex(xml: &str) -> Result<Vec<String>> {
353    let mut reader = Reader::from_str(xml);
354    let mut buf = Vec::new();
355    let mut urls = Vec::new();
356    let mut in_sitemap = false;
357    let mut in_loc = false;
358
359    loop {
360        match reader.read_event_into(&mut buf) {
361            Ok(Event::Start(ref e)) => {
362                let name = local_name(e);
363                match name.as_str() {
364                    "sitemap" => in_sitemap = true,
365                    "loc" if in_sitemap => in_loc = true,
366                    _ => {}
367                }
368            }
369            Ok(Event::Text(ref t)) if in_loc => {
370                let text = t.xml10_content().unwrap_or_default().trim().to_string();
371                if !text.is_empty() {
372                    urls.push(text);
373                }
374            }
375            Ok(Event::End(ref e)) => {
376                let name = local_name_end(e);
377                match name.as_str() {
378                    "sitemap" => {
379                        in_sitemap = false;
380                        in_loc = false;
381                    }
382                    "loc" => in_loc = false,
383                    _ => {}
384                }
385            }
386            Ok(Event::Eof) => break,
387            Err(e) => {
388                return Err(StygianError::Service(ServiceError::InvalidResponse(
389                    format!("sitemapindex XML parse error: {e}"),
390                )));
391            }
392            _ => {}
393        }
394        buf.clear();
395    }
396
397    Ok(urls)
398}
399
400/// Extract the local name (without namespace prefix) from a start element.
401fn local_name(e: &quick_xml::events::BytesStart<'_>) -> String {
402    std::str::from_utf8(e.local_name().as_ref())
403        .unwrap_or("")
404        .to_string()
405}
406
407/// Extract the local name from an end element.
408fn local_name_end(e: &quick_xml::events::BytesEnd<'_>) -> String {
409    std::str::from_utf8(e.local_name().as_ref())
410        .unwrap_or("")
411        .to_string()
412}
413
414// ─── Builder ──────────────────────────────────────────────────────────────────
415
416#[derive(Default)]
417struct SitemapEntryBuilder {
418    loc: Option<String>,
419    lastmod: Option<String>,
420    changefreq: Option<String>,
421    priority: Option<f64>,
422}
423
424impl SitemapEntryBuilder {
425    fn build(self) -> Option<SitemapEntry> {
426        Some(SitemapEntry {
427            loc: self.loc?,
428            lastmod: self.lastmod,
429            changefreq: self.changefreq,
430            priority: self.priority,
431        })
432    }
433}
434
435// ─── Tests ────────────────────────────────────────────────────────────────────
436
#[cfg(test)]
mod tests {
    use super::*;

    /// Return type shared by the fallible tests in this module.
    type TestResult = std::result::Result<(), Box<dyn std::error::Error>>;

    const URLSET_XML: &str = r#"<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url>
    <loc>https://example.com/page1</loc>
    <lastmod>2026-03-01</lastmod>
    <changefreq>daily</changefreq>
    <priority>0.8</priority>
  </url>
  <url>
    <loc>https://example.com/page2</loc>
    <lastmod>2026-02-15</lastmod>
    <priority>0.5</priority>
  </url>
  <url>
    <loc>https://example.com/page3</loc>
  </url>
</urlset>"#;

    const SITEMAPINDEX_XML: &str = r#"<?xml version="1.0" encoding="UTF-8"?>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <sitemap>
    <loc>https://example.com/sitemap1.xml</loc>
    <lastmod>2026-03-01</lastmod>
  </sitemap>
  <sitemap>
    <loc>https://example.com/sitemap2.xml.gz</loc>
  </sitemap>
</sitemapindex>"#;

    /// All three `<url>` elements are returned with their optional fields.
    #[test]
    fn parse_urlset_with_3_urls() -> TestResult {
        let entries = parse_urlset(URLSET_XML)?;
        assert_eq!(entries.len(), 3);

        let [first, second, third] = entries.as_slice() else {
            return Err("expected exactly three entries".into());
        };

        assert_eq!(first.loc, "https://example.com/page1");
        assert_eq!(first.lastmod.as_deref(), Some("2026-03-01"));
        assert_eq!(first.changefreq.as_deref(), Some("daily"));
        assert_eq!(first.priority, Some(0.8));

        assert_eq!(second.loc, "https://example.com/page2");
        assert_eq!(second.priority, Some(0.5));
        assert!(second.changefreq.is_none());

        assert_eq!(third.loc, "https://example.com/page3");
        assert!(third.lastmod.is_none());
        assert!(third.priority.is_none());

        Ok(())
    }

    /// The index parser yields the nested sitemap locations in document order.
    #[test]
    fn parse_sitemapindex_extracts_nested_urls() -> TestResult {
        let urls = parse_sitemapindex(SITEMAPINDEX_XML)?;
        let expected = [
            "https://example.com/sitemap1.xml",
            "https://example.com/sitemap2.xml.gz",
        ];
        assert_eq!(urls, expected);
        Ok(())
    }

    #[test]
    fn detect_root_urlset() -> TestResult {
        assert_eq!(detect_root_element(URLSET_XML)?, RootElement::UrlSet);
        Ok(())
    }

    #[test]
    fn detect_root_sitemapindex() -> TestResult {
        assert_eq!(
            detect_root_element(SITEMAPINDEX_XML)?,
            RootElement::SitemapIndex
        );
        Ok(())
    }

    /// Keeping only entries dated on or after 2026-03-01 leaves page1.
    #[test]
    fn filter_by_lastmod_range() -> TestResult {
        let kept: Vec<SitemapEntry> = parse_urlset(URLSET_XML)?
            .into_iter()
            .filter(|e| e.lastmod.as_deref().is_some_and(|lm| lm >= "2026-03-01"))
            .collect();
        let locs: Vec<&str> = kept.iter().map(|e| e.loc.as_str()).collect();
        assert_eq!(locs, ["https://example.com/page1"]);
        Ok(())
    }

    /// A 0.6 threshold leaves only page1 (priority 0.8).
    #[test]
    fn filter_by_priority_threshold() -> TestResult {
        let kept: Vec<SitemapEntry> = parse_urlset(URLSET_XML)?
            .into_iter()
            .filter(|e| e.priority.unwrap_or(0.0) >= 0.6)
            .collect();
        let locs: Vec<&str> = kept.iter().map(|e| e.loc.as_str()).collect();
        assert_eq!(locs, ["https://example.com/page1"]);
        Ok(())
    }

    /// A gzip round trip of the fixture still parses to three entries.
    #[test]
    fn gzip_decompression() -> TestResult {
        use flate2::Compression;
        use flate2::write::GzEncoder;
        use std::io::Write;

        let mut encoder = GzEncoder::new(Vec::new(), Compression::default());
        encoder.write_all(URLSET_XML.as_bytes())?;
        let gz = encoder.finish()?;

        let mut roundtrip = String::new();
        GzDecoder::new(gz.as_slice()).read_to_string(&mut roundtrip)?;

        assert_eq!(parse_urlset(&roundtrip)?.len(), 3);
        Ok(())
    }

    #[test]
    fn malformed_xml_returns_error() {
        assert!(detect_root_element("<not-a-sitemap><broken").is_err());
    }

    #[test]
    fn empty_xml_returns_error() {
        assert!(detect_root_element("").is_err());
    }

    #[test]
    fn unexpected_root_element_returns_error() {
        let html = r#"<?xml version="1.0"?><html><body>nope</body></html>"#;
        assert!(detect_root_element(html).is_err());
    }

    #[test]
    fn urlset_with_no_urls_returns_empty() -> TestResult {
        let xml = r#"<?xml version="1.0"?><urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"></urlset>"#;
        assert!(parse_urlset(xml)?.is_empty());
        Ok(())
    }

    /// A `<url>` lacking `<loc>` is dropped; the valid sibling is kept.
    #[test]
    fn url_without_loc_is_skipped() -> TestResult {
        let xml = r#"<?xml version="1.0"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url>
    <lastmod>2026-01-01</lastmod>
  </url>
  <url>
    <loc>https://example.com/valid</loc>
  </url>
</urlset>"#;
        let entries = parse_urlset(xml)?;
        let locs: Vec<&str> = entries.iter().map(|e| e.loc.as_str()).collect();
        assert_eq!(locs, ["https://example.com/valid"]);
        Ok(())
    }
}