Skip to main content

stygian_graph/adapters/
sitemap.rs

1//! Sitemap / sitemap-index [`ScrapingService`](crate::ports::ScrapingService) adapter
2//!
3//! Parses XML sitemaps (`<urlset>`) and sitemap index files (`<sitemapindex>`),
4//! emitting discovered URLs with metadata for downstream pipeline nodes.
5//!
6//! Supports:
7//! - Standard sitemaps (`<urlset>` with `<url>` entries)
8//! - Sitemap index files (`<sitemapindex>` with nested `<sitemap>` refs)
9//! - Gzipped sitemaps (`.xml.gz`) via `flate2`
10//! - Filtering by `lastmod` date range or `priority` threshold
11//!
12//! # Example
13//!
14//! ```no_run
15//! use stygian_graph::adapters::sitemap::SitemapAdapter;
16//! use stygian_graph::ports::{ScrapingService, ServiceInput};
17//! use serde_json::json;
18//!
19//! # tokio::runtime::Runtime::new().unwrap().block_on(async {
20//! let adapter = SitemapAdapter::new(reqwest::Client::new(), 5);
21//! let input = ServiceInput {
22//!     url: "https://example.com/sitemap.xml".into(),
23//!     params: json!({}),
24//! };
25//! let output = adapter.execute(input).await.unwrap();
26//! println!("{}", output.data); // JSON array of discovered URLs
27//! # });
28//! ```
29
30use crate::domain::error::{Result, ServiceError, StygianError};
31use crate::ports::{ScrapingService, ServiceInput, ServiceOutput};
32use async_trait::async_trait;
33use flate2::read::GzDecoder;
34use quick_xml::Reader;
35use quick_xml::events::Event;
36use serde::{Deserialize, Serialize};
37use serde_json::json;
38use std::io::Read;
39
40// ─── Domain types ─────────────────────────────────────────────────────────────
41
/// A single URL entry extracted from a sitemap.
///
/// Mirrors the child elements of one `<url>` node. Only `loc` is mandatory;
/// every other field is `None` when the corresponding element is absent or
/// empty. Values are stored as they appear in the document (whitespace
/// trimmed by the parser) and are not validated against the formats or
/// ranges noted on the individual fields.
///
/// # Example
///
/// ```
/// use stygian_graph::adapters::sitemap::SitemapEntry;
///
/// let entry = SitemapEntry {
///     loc: "https://example.com/page".into(),
///     lastmod: Some("2026-03-01".into()),
///     changefreq: Some("weekly".into()),
///     priority: Some(0.8),
/// };
/// ```
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct SitemapEntry {
    /// Absolute URL.
    pub loc: String,
    /// Last-modified date string (ISO 8601), kept verbatim rather than parsed
    /// into a date type.
    pub lastmod: Option<String>,
    /// Change frequency hint.
    pub changefreq: Option<String>,
    /// Priority (0.0–1.0). `None` when the element is missing or its text
    /// does not parse as a floating-point number.
    pub priority: Option<f64>,
}
67
68// ─── Adapter ──────────────────────────────────────────────────────────────────
69
70/// Sitemap / sitemap-index source adapter.
71///
72/// Fetches and parses XML sitemaps, recursively resolving sitemap index files
73/// up to a configurable depth limit.
74///
75/// # Example
76///
77/// ```no_run
78/// use stygian_graph::adapters::sitemap::SitemapAdapter;
79///
80/// let adapter = SitemapAdapter::new(reqwest::Client::new(), 3);
81/// ```
82pub struct SitemapAdapter {
83    client: reqwest::Client,
84    max_depth: usize,
85}
86
87impl SitemapAdapter {
88    /// Create a new sitemap adapter.
89    ///
90    /// `max_depth` controls how many levels of sitemap-index nesting to follow.
91    ///
92    /// # Example
93    ///
94    /// ```
95    /// use stygian_graph::adapters::sitemap::SitemapAdapter;
96    ///
97    /// let adapter = SitemapAdapter::new(reqwest::Client::new(), 5);
98    /// ```
99    pub const fn new(client: reqwest::Client, max_depth: usize) -> Self {
100        Self { client, max_depth }
101    }
102
103    /// Fetch raw bytes from a URL, transparently decompressing `.xml.gz`.
104    ///
105    /// # Errors
106    ///
107    /// Returns [`StygianError::Service`] on HTTP or decompression failure.
108    async fn fetch_bytes(&self, url: &str) -> Result<String> {
109        let resp = self.client.get(url).send().await.map_err(|e| {
110            StygianError::Service(ServiceError::Unavailable(format!(
111                "sitemap fetch failed: {e}"
112            )))
113        })?;
114
115        if !resp.status().is_success() {
116            return Err(StygianError::Service(ServiceError::InvalidResponse(
117                format!("sitemap returned HTTP {}", resp.status()),
118            )));
119        }
120
121        let bytes = resp.bytes().await.map_err(|e| {
122            StygianError::Service(ServiceError::Unavailable(format!(
123                "sitemap body read failed: {e}"
124            )))
125        })?;
126
127        // Attempt gzip decompression if URL ends in .gz or content looks gzipped
128        if url.to_ascii_lowercase().ends_with(".gz") || bytes.starts_with(&[0x1f, 0x8b]) {
129            let mut decoder = GzDecoder::new(&bytes[..]);
130            let mut xml = String::new();
131            decoder.read_to_string(&mut xml).map_err(|e| {
132                StygianError::Service(ServiceError::InvalidResponse(format!(
133                    "gzip decompression failed: {e}"
134                )))
135            })?;
136            Ok(xml)
137        } else {
138            String::from_utf8(bytes.to_vec()).map_err(|e| {
139                StygianError::Service(ServiceError::InvalidResponse(format!(
140                    "sitemap not valid UTF-8: {e}"
141                )))
142            })
143        }
144    }
145
146    /// Recursively resolve a sitemap URL, returning all discovered entries.
147    ///
148    /// # Errors
149    ///
150    /// Returns [`StygianError::Service`] on fetch, parse, or depth-limit errors.
151    async fn resolve(&self, url: &str, depth: usize) -> Result<Vec<SitemapEntry>> {
152        if depth > self.max_depth {
153            return Err(StygianError::Service(ServiceError::InvalidResponse(
154                format!(
155                    "sitemap index nesting exceeded max depth ({depth} > {})",
156                    self.max_depth
157                ),
158            )));
159        }
160
161        let xml = self.fetch_bytes(url).await?;
162        let root_kind = detect_root_element(&xml)?;
163
164        match root_kind {
165            RootElement::UrlSet => parse_urlset(&xml),
166            RootElement::SitemapIndex => {
167                let nested_urls = parse_sitemapindex(&xml)?;
168                let mut all = Vec::new();
169                for nested_url in &nested_urls {
170                    let entries = Box::pin(self.resolve(nested_url, depth + 1)).await?;
171                    all.extend(entries);
172                }
173                Ok(all)
174            }
175        }
176    }
177}
178
179#[async_trait]
180impl ScrapingService for SitemapAdapter {
181    /// Fetch and parse a sitemap, returning discovered URLs as JSON.
182    ///
183    /// # Params (optional)
184    ///
185    /// * `min_priority` — f64, filter entries with priority >= this value.
186    /// * `lastmod_after` — string, include only entries with lastmod >= this date.
187    /// * `lastmod_before` — string, include only entries with lastmod <= this date.
188    ///
189    /// # Example
190    ///
191    /// ```no_run
192    /// # use stygian_graph::adapters::sitemap::SitemapAdapter;
193    /// # use stygian_graph::ports::{ScrapingService, ServiceInput};
194    /// # use serde_json::json;
195    /// # tokio::runtime::Runtime::new().unwrap().block_on(async {
196    /// let adapter = SitemapAdapter::new(reqwest::Client::new(), 5);
197    /// let input = ServiceInput {
198    ///     url: "https://example.com/sitemap.xml".into(),
199    ///     params: json!({ "min_priority": 0.5 }),
200    /// };
201    /// let out = adapter.execute(input).await.unwrap();
202    /// # });
203    /// ```
204    async fn execute(&self, input: ServiceInput) -> Result<ServiceOutput> {
205        let mut entries = self.resolve(&input.url, 0).await?;
206
207        // Apply optional filters
208        if let Some(min_pri) = input
209            .params
210            .get("min_priority")
211            .and_then(serde_json::Value::as_f64)
212        {
213            entries.retain(|e| e.priority.unwrap_or(0.0) >= min_pri);
214        }
215        if let Some(after) = input.params.get("lastmod_after").and_then(|v| v.as_str()) {
216            entries.retain(|e| e.lastmod.as_deref().is_some_and(|lm| lm >= after));
217        }
218        if let Some(before) = input.params.get("lastmod_before").and_then(|v| v.as_str()) {
219            entries.retain(|e| e.lastmod.as_deref().is_some_and(|lm| lm <= before));
220        }
221
222        let count = entries.len();
223        let data = serde_json::to_string(&entries).map_err(|e| {
224            StygianError::Service(ServiceError::InvalidResponse(format!(
225                "sitemap serialization failed: {e}"
226            )))
227        })?;
228
229        Ok(ServiceOutput {
230            data,
231            metadata: json!({
232                "source": "sitemap",
233                "url_count": count,
234                "source_url": input.url,
235            }),
236        })
237    }
238
239    fn name(&self) -> &'static str {
240        "sitemap"
241    }
242}
243
244// ─── XML parsing helpers ──────────────────────────────────────────────────────
245
/// Kind of sitemap document, as identified by the local name of its root
/// element.
#[derive(Debug, PartialEq)]
enum RootElement {
    /// `<urlset>`: a regular sitemap listing page URLs.
    UrlSet,
    /// `<sitemapindex>`: an index whose entries point at further sitemaps.
    SitemapIndex,
}
251
252/// Detect whether the XML document is a `<urlset>` or `<sitemapindex>`.
253fn detect_root_element(xml: &str) -> Result<RootElement> {
254    let mut reader = Reader::from_str(xml);
255    let mut buf = Vec::new();
256
257    loop {
258        match reader.read_event_into(&mut buf) {
259            Ok(Event::Start(ref e) | Event::Empty(ref e)) => {
260                let local = e.local_name();
261                let name = std::str::from_utf8(local.as_ref()).unwrap_or("");
262                return match name {
263                    "urlset" => Ok(RootElement::UrlSet),
264                    "sitemapindex" => Ok(RootElement::SitemapIndex),
265                    _ => Err(StygianError::Service(ServiceError::InvalidResponse(
266                        format!("unexpected XML root element: <{name}>"),
267                    ))),
268                };
269            }
270            Ok(Event::Eof) => {
271                return Err(StygianError::Service(ServiceError::InvalidResponse(
272                    "empty or invalid XML document".into(),
273                )));
274            }
275            Err(e) => {
276                return Err(StygianError::Service(ServiceError::InvalidResponse(
277                    format!("XML parse error: {e}"),
278                )));
279            }
280            _ => {} // skip processing instructions, comments, decl
281        }
282        buf.clear();
283    }
284}
285
286/// Parse a `<urlset>` document into a list of [`SitemapEntry`].
287fn parse_urlset(xml: &str) -> Result<Vec<SitemapEntry>> {
288    let mut reader = Reader::from_str(xml);
289    let mut buf = Vec::new();
290    let mut entries = Vec::new();
291
292    // Current entry being built
293    let mut current: Option<SitemapEntryBuilder> = None;
294    let mut current_tag: Option<String> = None;
295
296    loop {
297        match reader.read_event_into(&mut buf) {
298            Ok(Event::Start(ref e)) => {
299                let name = local_name(e);
300                match name.as_str() {
301                    "url" => {
302                        current = Some(SitemapEntryBuilder::default());
303                    }
304                    "loc" | "lastmod" | "changefreq" | "priority" => {
305                        current_tag = Some(name);
306                    }
307                    _ => {}
308                }
309            }
310            Ok(Event::Text(ref t)) => {
311                if let (Some(builder), Some(tag)) = (&mut current, &current_tag) {
312                    let text = t.unescape().unwrap_or_default().trim().to_string();
313                    if !text.is_empty() {
314                        match tag.as_str() {
315                            "loc" => builder.loc = Some(text),
316                            "lastmod" => builder.lastmod = Some(text),
317                            "changefreq" => builder.changefreq = Some(text),
318                            "priority" => builder.priority = text.parse().ok(),
319                            _ => {}
320                        }
321                    }
322                }
323            }
324            Ok(Event::End(ref e)) => {
325                let name = local_name_end(e);
326                if name == "url"
327                    && let Some(builder) = current.take()
328                    && let Some(entry) = builder.build()
329                {
330                    entries.push(entry);
331                }
332                if current_tag.as_deref() == Some(&name) {
333                    current_tag = None;
334                }
335            }
336            Ok(Event::Eof) => break,
337            Err(e) => {
338                return Err(StygianError::Service(ServiceError::InvalidResponse(
339                    format!("sitemap XML parse error: {e}"),
340                )));
341            }
342            _ => {}
343        }
344        buf.clear();
345    }
346
347    Ok(entries)
348}
349
350/// Parse a `<sitemapindex>` document, returning the `<loc>` URLs of nested sitemaps.
351fn parse_sitemapindex(xml: &str) -> Result<Vec<String>> {
352    let mut reader = Reader::from_str(xml);
353    let mut buf = Vec::new();
354    let mut urls = Vec::new();
355    let mut in_sitemap = false;
356    let mut in_loc = false;
357
358    loop {
359        match reader.read_event_into(&mut buf) {
360            Ok(Event::Start(ref e)) => {
361                let name = local_name(e);
362                match name.as_str() {
363                    "sitemap" => in_sitemap = true,
364                    "loc" if in_sitemap => in_loc = true,
365                    _ => {}
366                }
367            }
368            Ok(Event::Text(ref t)) if in_loc => {
369                let text = t.unescape().unwrap_or_default().trim().to_string();
370                if !text.is_empty() {
371                    urls.push(text);
372                }
373            }
374            Ok(Event::End(ref e)) => {
375                let name = local_name_end(e);
376                match name.as_str() {
377                    "sitemap" => {
378                        in_sitemap = false;
379                        in_loc = false;
380                    }
381                    "loc" => in_loc = false,
382                    _ => {}
383                }
384            }
385            Ok(Event::Eof) => break,
386            Err(e) => {
387                return Err(StygianError::Service(ServiceError::InvalidResponse(
388                    format!("sitemapindex XML parse error: {e}"),
389                )));
390            }
391            _ => {}
392        }
393        buf.clear();
394    }
395
396    Ok(urls)
397}
398
399/// Extract the local name (without namespace prefix) from a start element.
400fn local_name(e: &quick_xml::events::BytesStart<'_>) -> String {
401    std::str::from_utf8(e.local_name().as_ref())
402        .unwrap_or("")
403        .to_string()
404}
405
406/// Extract the local name from an end element.
407fn local_name_end(e: &quick_xml::events::BytesEnd<'_>) -> String {
408    std::str::from_utf8(e.local_name().as_ref())
409        .unwrap_or("")
410        .to_string()
411}
412
413// ─── Builder ──────────────────────────────────────────────────────────────────
414
/// Accumulator for the fields of one `<url>` element while it is being
/// parsed. Every field starts out as `None` (via `Default`) and is filled in
/// as the matching child element is encountered.
#[derive(Default)]
struct SitemapEntryBuilder {
    // Page URL; the only field `build` requires.
    loc: Option<String>,
    // Raw `<lastmod>` text.
    lastmod: Option<String>,
    // Raw `<changefreq>` text.
    changefreq: Option<String>,
    // Parsed `<priority>` value.
    priority: Option<f64>,
}
422
423impl SitemapEntryBuilder {
424    fn build(self) -> Option<SitemapEntry> {
425        Some(SitemapEntry {
426            loc: self.loc?,
427            lastmod: self.lastmod,
428            changefreq: self.changefreq,
429            priority: self.priority,
430        })
431    }
432}
433
434// ─── Tests ────────────────────────────────────────────────────────────────────
435
/// Unit tests for the XML parsing helpers. Nothing here touches the network;
/// the filter tests replicate the `retain` predicates inline instead of
/// calling `execute`.
#[cfg(test)]
mod tests {
    use super::*;

    // Three entries: one with every field, one without `changefreq`, and one
    // with only `loc`.
    const URLSET_XML: &str = r#"<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url>
    <loc>https://example.com/page1</loc>
    <lastmod>2026-03-01</lastmod>
    <changefreq>daily</changefreq>
    <priority>0.8</priority>
  </url>
  <url>
    <loc>https://example.com/page2</loc>
    <lastmod>2026-02-15</lastmod>
    <priority>0.5</priority>
  </url>
  <url>
    <loc>https://example.com/page3</loc>
  </url>
</urlset>"#;

    // Index with two nested sitemaps, one of them referencing a `.gz` URL.
    const SITEMAPINDEX_XML: &str = r#"<?xml version="1.0" encoding="UTF-8"?>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <sitemap>
    <loc>https://example.com/sitemap1.xml</loc>
    <lastmod>2026-03-01</lastmod>
  </sitemap>
  <sitemap>
    <loc>https://example.com/sitemap2.xml.gz</loc>
  </sitemap>
</sitemapindex>"#;

    // All three entries are returned in document order, with absent optional
    // elements surfacing as `None`.
    #[test]
    fn parse_urlset_with_3_urls() -> std::result::Result<(), Box<dyn std::error::Error>> {
        let entries = parse_urlset(URLSET_XML)?;
        assert_eq!(entries.len(), 3);

        let first = entries.first().ok_or("missing first entry")?;
        assert_eq!(first.loc, "https://example.com/page1");
        assert_eq!(first.lastmod.as_deref(), Some("2026-03-01"));
        assert_eq!(first.changefreq.as_deref(), Some("daily"));
        assert_eq!(first.priority, Some(0.8));

        let second = entries.get(1).ok_or("missing second entry")?;
        assert_eq!(second.loc, "https://example.com/page2");
        assert_eq!(second.priority, Some(0.5));
        assert!(second.changefreq.is_none());

        let third = entries.get(2).ok_or("missing third entry")?;
        assert_eq!(third.loc, "https://example.com/page3");
        assert!(third.lastmod.is_none());
        assert!(third.priority.is_none());

        Ok(())
    }

    // Only the `<loc>` values are extracted from an index; `<lastmod>` is
    // ignored.
    #[test]
    fn parse_sitemapindex_extracts_nested_urls()
    -> std::result::Result<(), Box<dyn std::error::Error>> {
        let urls = parse_sitemapindex(SITEMAPINDEX_XML)?;
        assert_eq!(urls.len(), 2);
        assert_eq!(
            urls.first().map(String::as_str),
            Some("https://example.com/sitemap1.xml")
        );
        assert_eq!(
            urls.get(1).map(String::as_str),
            Some("https://example.com/sitemap2.xml.gz")
        );
        Ok(())
    }

    // Root detection skips the XML declaration and recognises `<urlset>`.
    #[test]
    fn detect_root_urlset() -> std::result::Result<(), Box<dyn std::error::Error>> {
        let root = detect_root_element(URLSET_XML)?;
        assert_eq!(root, RootElement::UrlSet);
        Ok(())
    }

    // Root detection recognises `<sitemapindex>`.
    #[test]
    fn detect_root_sitemapindex() -> std::result::Result<(), Box<dyn std::error::Error>> {
        let root = detect_root_element(SITEMAPINDEX_XML)?;
        assert_eq!(root, RootElement::SitemapIndex);
        Ok(())
    }

    // Same predicate as the `lastmod_after` filter: lexicographic comparison,
    // with entries lacking `lastmod` dropped.
    #[test]
    fn filter_by_lastmod_range() -> std::result::Result<(), Box<dyn std::error::Error>> {
        let mut entries = parse_urlset(URLSET_XML)?;
        // Only entries on or after 2026-03-01
        entries.retain(|e| e.lastmod.as_deref().is_some_and(|lm| lm >= "2026-03-01"));
        assert_eq!(entries.len(), 1);
        assert_eq!(
            entries.first().map(|entry| entry.loc.as_str()),
            Some("https://example.com/page1")
        );
        Ok(())
    }

    // Priority threshold applied inline; only page1 (0.8) clears 0.6.
    #[test]
    fn filter_by_priority_threshold() -> std::result::Result<(), Box<dyn std::error::Error>> {
        let mut entries = parse_urlset(URLSET_XML)?;
        entries.retain(|e| e.priority.unwrap_or(0.0) >= 0.6);
        assert_eq!(entries.len(), 1);
        assert_eq!(
            entries.first().map(|entry| entry.loc.as_str()),
            Some("https://example.com/page1")
        );
        Ok(())
    }

    // Round-trips the fixture through gzip and parses the decompressed text.
    // This exercises `flate2` plus the parser, not the adapter's fetch path.
    #[test]
    fn gzip_decompression() -> std::result::Result<(), Box<dyn std::error::Error>> {
        use flate2::Compression;
        use flate2::write::GzEncoder;
        use std::io::Write;

        let xml = URLSET_XML;
        let mut encoder = GzEncoder::new(Vec::new(), Compression::default());
        encoder.write_all(xml.as_bytes())?;
        let compressed = encoder.finish()?;

        // Decompress and parse
        let mut decoder = GzDecoder::new(&compressed[..]);
        let mut decompressed = String::new();
        decoder.read_to_string(&mut decompressed)?;

        let entries = parse_urlset(&decompressed)?;
        assert_eq!(entries.len(), 3);
        Ok(())
    }

    // The first element is not a known sitemap root, so detection fails.
    #[test]
    fn malformed_xml_returns_error() {
        let bad = "<not-a-sitemap><broken";
        let result = detect_root_element(bad);
        assert!(result.is_err());
    }

    // No element at all: the reader hits end-of-input first.
    #[test]
    fn empty_xml_returns_error() {
        let result = detect_root_element("");
        assert!(result.is_err());
    }

    // Well-formed XML with an unrelated root (`<html>`) is rejected.
    #[test]
    fn unexpected_root_element_returns_error() {
        let xml = r#"<?xml version="1.0"?><html><body>nope</body></html>"#;
        let result = detect_root_element(xml);
        assert!(result.is_err());
    }

    // An empty `<urlset>` is valid and yields no entries.
    #[test]
    fn urlset_with_no_urls_returns_empty() -> std::result::Result<(), Box<dyn std::error::Error>> {
        let xml = r#"<?xml version="1.0"?><urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"></urlset>"#;
        let entries = parse_urlset(xml)?;
        assert!(entries.is_empty());
        Ok(())
    }

    // A `<url>` without `<loc>` cannot be built and is silently dropped.
    #[test]
    fn url_without_loc_is_skipped() -> std::result::Result<(), Box<dyn std::error::Error>> {
        let xml = r#"<?xml version="1.0"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url>
    <lastmod>2026-01-01</lastmod>
  </url>
  <url>
    <loc>https://example.com/valid</loc>
  </url>
</urlset>"#;
        let entries = parse_urlset(xml)?;
        assert_eq!(entries.len(), 1);
        assert_eq!(
            entries.first().map(|entry| entry.loc.as_str()),
            Some("https://example.com/valid")
        );
        Ok(())
    }
}