1use crate::domain::error::{Result, ServiceError, StygianError};
31use crate::ports::{ScrapingService, ServiceInput, ServiceOutput};
32use async_trait::async_trait;
33use flate2::read::GzDecoder;
34use quick_xml::Reader;
35use quick_xml::events::Event;
36use serde::{Deserialize, Serialize};
37use serde_json::json;
38use std::io::Read;
39
40#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
57pub struct SitemapEntry {
58 pub loc: String,
60 pub lastmod: Option<String>,
62 pub changefreq: Option<String>,
64 pub priority: Option<f64>,
66}
67
68pub struct SitemapAdapter {
83 client: reqwest::Client,
84 max_depth: usize,
85}
86
87impl SitemapAdapter {
88 pub const fn new(client: reqwest::Client, max_depth: usize) -> Self {
100 Self { client, max_depth }
101 }
102
103 async fn fetch_bytes(&self, url: &str) -> Result<String> {
109 let resp = self.client.get(url).send().await.map_err(|e| {
110 StygianError::Service(ServiceError::Unavailable(format!(
111 "sitemap fetch failed: {e}"
112 )))
113 })?;
114
115 if !resp.status().is_success() {
116 return Err(StygianError::Service(ServiceError::InvalidResponse(
117 format!("sitemap returned HTTP {}", resp.status()),
118 )));
119 }
120
121 let bytes = resp.bytes().await.map_err(|e| {
122 StygianError::Service(ServiceError::Unavailable(format!(
123 "sitemap body read failed: {e}"
124 )))
125 })?;
126
127 if url.to_ascii_lowercase().ends_with(".gz") || bytes.starts_with(&[0x1f, 0x8b]) {
129 let mut decoder = GzDecoder::new(&bytes[..]);
130 let mut xml = String::new();
131 decoder.read_to_string(&mut xml).map_err(|e| {
132 StygianError::Service(ServiceError::InvalidResponse(format!(
133 "gzip decompression failed: {e}"
134 )))
135 })?;
136 Ok(xml)
137 } else {
138 String::from_utf8(bytes.to_vec()).map_err(|e| {
139 StygianError::Service(ServiceError::InvalidResponse(format!(
140 "sitemap not valid UTF-8: {e}"
141 )))
142 })
143 }
144 }
145
146 async fn resolve(&self, url: &str, depth: usize) -> Result<Vec<SitemapEntry>> {
152 if depth > self.max_depth {
153 return Err(StygianError::Service(ServiceError::InvalidResponse(
154 format!(
155 "sitemap index nesting exceeded max depth ({depth} > {})",
156 self.max_depth
157 ),
158 )));
159 }
160
161 let xml = self.fetch_bytes(url).await?;
162 let root_kind = detect_root_element(&xml)?;
163
164 match root_kind {
165 RootElement::UrlSet => parse_urlset(&xml),
166 RootElement::SitemapIndex => {
167 let nested_urls = parse_sitemapindex(&xml)?;
168 let mut all = Vec::new();
169 for nested_url in &nested_urls {
170 let entries = Box::pin(self.resolve(nested_url, depth + 1)).await?;
171 all.extend(entries);
172 }
173 Ok(all)
174 }
175 }
176 }
177}
178
179#[async_trait]
180impl ScrapingService for SitemapAdapter {
181 async fn execute(&self, input: ServiceInput) -> Result<ServiceOutput> {
205 let mut entries = self.resolve(&input.url, 0).await?;
206
207 if let Some(min_pri) = input
209 .params
210 .get("min_priority")
211 .and_then(serde_json::Value::as_f64)
212 {
213 entries.retain(|e| e.priority.unwrap_or(0.0) >= min_pri);
214 }
215 if let Some(after) = input.params.get("lastmod_after").and_then(|v| v.as_str()) {
216 entries.retain(|e| e.lastmod.as_deref().is_some_and(|lm| lm >= after));
217 }
218 if let Some(before) = input.params.get("lastmod_before").and_then(|v| v.as_str()) {
219 entries.retain(|e| e.lastmod.as_deref().is_some_and(|lm| lm <= before));
220 }
221
222 let count = entries.len();
223 let data = serde_json::to_string(&entries).map_err(|e| {
224 StygianError::Service(ServiceError::InvalidResponse(format!(
225 "sitemap serialization failed: {e}"
226 )))
227 })?;
228
229 Ok(ServiceOutput {
230 data,
231 metadata: json!({
232 "source": "sitemap",
233 "url_count": count,
234 "source_url": input.url,
235 }),
236 })
237 }
238
239 fn name(&self) -> &'static str {
240 "sitemap"
241 }
242}
243
/// Kind of sitemap document, decided by the local name of its root element.
#[derive(PartialEq, Debug)]
enum RootElement {
    /// Root is `<urlset>`: the document lists page URLs.
    UrlSet,
    /// Root is `<sitemapindex>`: the document lists further sitemaps.
    SitemapIndex,
}
251
252fn detect_root_element(xml: &str) -> Result<RootElement> {
254 let mut reader = Reader::from_str(xml);
255 let mut buf = Vec::new();
256
257 loop {
258 match reader.read_event_into(&mut buf) {
259 Ok(Event::Start(ref e) | Event::Empty(ref e)) => {
260 let local = e.local_name();
261 let name = std::str::from_utf8(local.as_ref()).unwrap_or("");
262 return match name {
263 "urlset" => Ok(RootElement::UrlSet),
264 "sitemapindex" => Ok(RootElement::SitemapIndex),
265 _ => Err(StygianError::Service(ServiceError::InvalidResponse(
266 format!("unexpected XML root element: <{name}>"),
267 ))),
268 };
269 }
270 Ok(Event::Eof) => {
271 return Err(StygianError::Service(ServiceError::InvalidResponse(
272 "empty or invalid XML document".into(),
273 )));
274 }
275 Err(e) => {
276 return Err(StygianError::Service(ServiceError::InvalidResponse(
277 format!("XML parse error: {e}"),
278 )));
279 }
280 _ => {} }
282 buf.clear();
283 }
284}
285
286fn parse_urlset(xml: &str) -> Result<Vec<SitemapEntry>> {
288 let mut reader = Reader::from_str(xml);
289 let mut buf = Vec::new();
290 let mut entries = Vec::new();
291
292 let mut current: Option<SitemapEntryBuilder> = None;
294 let mut current_tag: Option<String> = None;
295
296 loop {
297 match reader.read_event_into(&mut buf) {
298 Ok(Event::Start(ref e)) => {
299 let name = local_name(e);
300 match name.as_str() {
301 "url" => {
302 current = Some(SitemapEntryBuilder::default());
303 }
304 "loc" | "lastmod" | "changefreq" | "priority" => {
305 current_tag = Some(name);
306 }
307 _ => {}
308 }
309 }
310 Ok(Event::Text(ref t)) => {
311 if let (Some(builder), Some(tag)) = (&mut current, ¤t_tag) {
312 let text = t.unescape().unwrap_or_default().trim().to_string();
313 if !text.is_empty() {
314 match tag.as_str() {
315 "loc" => builder.loc = Some(text),
316 "lastmod" => builder.lastmod = Some(text),
317 "changefreq" => builder.changefreq = Some(text),
318 "priority" => builder.priority = text.parse().ok(),
319 _ => {}
320 }
321 }
322 }
323 }
324 Ok(Event::End(ref e)) => {
325 let name = local_name_end(e);
326 if name == "url"
327 && let Some(builder) = current.take()
328 && let Some(entry) = builder.build()
329 {
330 entries.push(entry);
331 }
332 if current_tag.as_deref() == Some(&name) {
333 current_tag = None;
334 }
335 }
336 Ok(Event::Eof) => break,
337 Err(e) => {
338 return Err(StygianError::Service(ServiceError::InvalidResponse(
339 format!("sitemap XML parse error: {e}"),
340 )));
341 }
342 _ => {}
343 }
344 buf.clear();
345 }
346
347 Ok(entries)
348}
349
350fn parse_sitemapindex(xml: &str) -> Result<Vec<String>> {
352 let mut reader = Reader::from_str(xml);
353 let mut buf = Vec::new();
354 let mut urls = Vec::new();
355 let mut in_sitemap = false;
356 let mut in_loc = false;
357
358 loop {
359 match reader.read_event_into(&mut buf) {
360 Ok(Event::Start(ref e)) => {
361 let name = local_name(e);
362 match name.as_str() {
363 "sitemap" => in_sitemap = true,
364 "loc" if in_sitemap => in_loc = true,
365 _ => {}
366 }
367 }
368 Ok(Event::Text(ref t)) if in_loc => {
369 let text = t.unescape().unwrap_or_default().trim().to_string();
370 if !text.is_empty() {
371 urls.push(text);
372 }
373 }
374 Ok(Event::End(ref e)) => {
375 let name = local_name_end(e);
376 match name.as_str() {
377 "sitemap" => {
378 in_sitemap = false;
379 in_loc = false;
380 }
381 "loc" => in_loc = false,
382 _ => {}
383 }
384 }
385 Ok(Event::Eof) => break,
386 Err(e) => {
387 return Err(StygianError::Service(ServiceError::InvalidResponse(
388 format!("sitemapindex XML parse error: {e}"),
389 )));
390 }
391 _ => {}
392 }
393 buf.clear();
394 }
395
396 Ok(urls)
397}
398
399fn local_name(e: &quick_xml::events::BytesStart<'_>) -> String {
401 std::str::from_utf8(e.local_name().as_ref())
402 .unwrap_or("")
403 .to_string()
404}
405
406fn local_name_end(e: &quick_xml::events::BytesEnd<'_>) -> String {
408 std::str::from_utf8(e.local_name().as_ref())
409 .unwrap_or("")
410 .to_string()
411}
412
/// Accumulates the fields of one `<url>` element while it is being parsed.
///
/// Every field starts out as `None`.
#[derive(Default)]
struct SitemapEntryBuilder {
    /// Value of `<loc>`, once seen.
    loc: Option<String>,
    /// Value of `<lastmod>`, once seen.
    lastmod: Option<String>,
    /// Value of `<changefreq>`, once seen.
    changefreq: Option<String>,
    /// Parsed value of `<priority>`, once seen.
    priority: Option<f64>,
}
422
423impl SitemapEntryBuilder {
424 fn build(self) -> Option<SitemapEntry> {
425 Some(SitemapEntry {
426 loc: self.loc?,
427 lastmod: self.lastmod,
428 changefreq: self.changefreq,
429 priority: self.priority,
430 })
431 }
432}
433
#[cfg(test)]
mod tests {
    use super::*;

    /// Result type for tests that propagate failures with `?`.
    type TestResult = std::result::Result<(), Box<dyn std::error::Error>>;

    /// Three-entry url set: fully populated, partially populated, `loc` only.
    const URLSET_XML: &str = r#"<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url>
    <loc>https://example.com/page1</loc>
    <lastmod>2026-03-01</lastmod>
    <changefreq>daily</changefreq>
    <priority>0.8</priority>
  </url>
  <url>
    <loc>https://example.com/page2</loc>
    <lastmod>2026-02-15</lastmod>
    <priority>0.5</priority>
  </url>
  <url>
    <loc>https://example.com/page3</loc>
  </url>
</urlset>"#;

    /// Index that references one plain and one gzip-named sitemap.
    const SITEMAPINDEX_XML: &str = r#"<?xml version="1.0" encoding="UTF-8"?>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <sitemap>
    <loc>https://example.com/sitemap1.xml</loc>
    <lastmod>2026-03-01</lastmod>
  </sitemap>
  <sitemap>
    <loc>https://example.com/sitemap2.xml.gz</loc>
  </sitemap>
</sitemapindex>"#;

    #[test]
    fn parse_urlset_with_3_urls() -> TestResult {
        let entries = parse_urlset(URLSET_XML)?;
        assert_eq!(entries.len(), 3);

        let mut remaining = entries.iter();

        let first = remaining.next().ok_or("missing first entry")?;
        assert_eq!(first.loc, "https://example.com/page1");
        assert_eq!(first.lastmod.as_deref(), Some("2026-03-01"));
        assert_eq!(first.changefreq.as_deref(), Some("daily"));
        assert_eq!(first.priority, Some(0.8));

        let second = remaining.next().ok_or("missing second entry")?;
        assert_eq!(second.loc, "https://example.com/page2");
        assert_eq!(second.priority, Some(0.5));
        assert!(second.changefreq.is_none());

        let third = remaining.next().ok_or("missing third entry")?;
        assert_eq!(third.loc, "https://example.com/page3");
        assert!(third.lastmod.is_none());
        assert!(third.priority.is_none());

        Ok(())
    }

    #[test]
    fn parse_sitemapindex_extracts_nested_urls() -> TestResult {
        let urls = parse_sitemapindex(SITEMAPINDEX_XML)?;
        assert_eq!(urls.len(), 2);

        let mut remaining = urls.iter().map(String::as_str);
        assert_eq!(remaining.next(), Some("https://example.com/sitemap1.xml"));
        assert_eq!(remaining.next(), Some("https://example.com/sitemap2.xml.gz"));
        Ok(())
    }

    #[test]
    fn detect_root_urlset() -> TestResult {
        assert_eq!(detect_root_element(URLSET_XML)?, RootElement::UrlSet);
        Ok(())
    }

    #[test]
    fn detect_root_sitemapindex() -> TestResult {
        assert_eq!(
            detect_root_element(SITEMAPINDEX_XML)?,
            RootElement::SitemapIndex
        );
        Ok(())
    }

    #[test]
    fn filter_by_lastmod_range() -> TestResult {
        let kept: Vec<SitemapEntry> = parse_urlset(URLSET_XML)?
            .into_iter()
            .filter(|entry| {
                entry
                    .lastmod
                    .as_deref()
                    .is_some_and(|stamp| stamp >= "2026-03-01")
            })
            .collect();

        assert_eq!(kept.len(), 1);
        assert_eq!(
            kept.first().map(|entry| entry.loc.as_str()),
            Some("https://example.com/page1")
        );
        Ok(())
    }

    #[test]
    fn filter_by_priority_threshold() -> TestResult {
        let kept: Vec<SitemapEntry> = parse_urlset(URLSET_XML)?
            .into_iter()
            .filter(|entry| entry.priority.unwrap_or(0.0) >= 0.6)
            .collect();

        assert_eq!(kept.len(), 1);
        assert_eq!(
            kept.first().map(|entry| entry.loc.as_str()),
            Some("https://example.com/page1")
        );
        Ok(())
    }

    #[test]
    fn gzip_decompression() -> TestResult {
        use flate2::Compression;
        use flate2::write::GzEncoder;
        use std::io::Write;

        // Round-trip the fixture through gzip, then parse what comes back.
        let mut encoder = GzEncoder::new(Vec::new(), Compression::default());
        encoder.write_all(URLSET_XML.as_bytes())?;
        let compressed = encoder.finish()?;

        let mut decompressed = String::new();
        GzDecoder::new(compressed.as_slice()).read_to_string(&mut decompressed)?;

        let entries = parse_urlset(&decompressed)?;
        assert_eq!(entries.len(), 3);
        Ok(())
    }

    #[test]
    fn malformed_xml_returns_error() {
        assert!(detect_root_element("<not-a-sitemap><broken").is_err());
    }

    #[test]
    fn empty_xml_returns_error() {
        assert!(detect_root_element("").is_err());
    }

    #[test]
    fn unexpected_root_element_returns_error() {
        let xml = r#"<?xml version="1.0"?><html><body>nope</body></html>"#;
        assert!(detect_root_element(xml).is_err());
    }

    #[test]
    fn urlset_with_no_urls_returns_empty() -> TestResult {
        let xml = r#"<?xml version="1.0"?><urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"></urlset>"#;
        assert!(parse_urlset(xml)?.is_empty());
        Ok(())
    }

    #[test]
    fn url_without_loc_is_skipped() -> TestResult {
        let xml = r#"<?xml version="1.0"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url>
    <lastmod>2026-01-01</lastmod>
  </url>
  <url>
    <loc>https://example.com/valid</loc>
  </url>
</urlset>"#;
        let entries = parse_urlset(xml)?;
        assert_eq!(entries.len(), 1);
        assert_eq!(
            entries.first().map(|entry| entry.loc.as_str()),
            Some("https://example.com/valid")
        );
        Ok(())
    }
}