stygian_graph/adapters/
http.rs

//! HTTP scraping adapter with anti-bot features
//!
//! Implements the `ScrapingService` port using reqwest with:
//! - Realistic browser headers and User-Agent rotation
//! - Cookie jar persistence across requests in a session
//! - Exponential backoff retry (by default up to 3 retries after the initial attempt)
//! - Configurable timeouts
//! - Optional proxy support
//!
//! # Example
//!
//! ```no_run
//! use stygian_graph::adapters::http::{HttpAdapter, HttpConfig};
//! use stygian_graph::ports::{ScrapingService, ServiceInput};
//! use serde_json::json;
//!
//! # tokio::runtime::Runtime::new().unwrap().block_on(async {
//! let adapter = HttpAdapter::with_config(HttpConfig::default());
//! let input = ServiceInput {
//!     url: "https://httpbin.org/get".to_string(),
//!     params: json!({}),
//! };
//! // let result = adapter.execute(input).await.unwrap();
//! # });
//! ```
26
27use std::time::Duration;
28
29use async_trait::async_trait;
30use reqwest::{Client, Proxy, header};
31
32use crate::domain::error::{Result, ServiceError, StygianError};
33use crate::ports::{ScrapingService, ServiceInput, ServiceOutput};
34
/// Pool of realistic desktop browser `User-Agent` strings.
///
/// Entries are handed out round-robin when User-Agent rotation is enabled;
/// the first entry is used for every request when rotation is disabled.
/// The order of the entries therefore determines the rotation sequence.
static USER_AGENTS: &[&str] = &[
    // Chrome 131 on Windows, macOS and Linux.
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    // Firefox 133 on Windows and macOS.
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 14.7; rv:133.0) Gecko/20100101 Firefox/133.0",
    // Safari 18.1 on macOS.
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_7_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.1 Safari/605.1.15",
];
44
/// Tunable settings for the HTTP adapter.
///
/// Cloning a configuration clones the `Arc` around the User-Agent counter, so
/// every clone advances the same round-robin position.
#[derive(Debug, Clone)]
pub struct HttpConfig {
    /// Timeout applied to each request (default: 30 seconds).
    pub timeout: Duration,
    /// How many times a transient failure is retried after the first
    /// attempt (default: 3).
    pub max_retries: u32,
    /// Delay before the first retry; doubled for each further retry
    /// (default: 1 second).
    pub retry_base_delay: Duration,
    /// Optional HTTP/SOCKS5 proxy URL; `None` means no proxy is configured.
    pub proxy_url: Option<String>,
    /// When `true`, each request takes the next User-Agent from the pool;
    /// when `false`, the first pool entry is always used.
    pub rotate_user_agent: bool,
    /// Monotonic request counter; reduced modulo the pool size to pick the
    /// next entry of `USER_AGENTS`.
    pub(crate) ua_counter: std::sync::Arc<std::sync::atomic::AtomicUsize>,
}
61
62impl Default for HttpConfig {
63    fn default() -> Self {
64        Self {
65            timeout: Duration::from_secs(30),
66            max_retries: 3,
67            retry_base_delay: Duration::from_secs(1),
68            proxy_url: None,
69            rotate_user_agent: true,
70            ua_counter: std::sync::Arc::new(std::sync::atomic::AtomicUsize::new(0)),
71        }
72    }
73}
74
75/// HTTP client adapter with anti-bot features.
76///
77/// Thread-safe and cheaply cloneable — the internal `reqwest::Client` uses
78/// an `Arc` internally and maintains a shared cookie jar.
79#[derive(Clone)]
80pub struct HttpAdapter {
81    client: Client,
82    config: HttpConfig,
83}
84
85impl HttpAdapter {
86    /// Create a new HTTP adapter with default configuration.
87    ///
88    /// # Example
89    ///
90    /// ```no_run
91    /// use stygian_graph::adapters::http::HttpAdapter;
92    /// let adapter = HttpAdapter::new();
93    /// ```
94    pub fn new() -> Self {
95        Self::with_config(HttpConfig::default())
96    }
97
98    /// Create an HTTP adapter with custom configuration.
99    ///
100    /// # Panics
101    ///
102    /// Panics only if TLS configuration is unavailable (extremely rare).
103    pub fn with_config(config: HttpConfig) -> Self {
104        let mut builder = Client::builder()
105            .timeout(config.timeout)
106            .cookie_store(true)
107            .gzip(true)
108            .brotli(true)
109            .use_rustls_tls()
110            .default_headers(Self::default_headers());
111
112        if let Some(ref proxy_url) = config.proxy_url
113            && let Ok(proxy) = Proxy::all(proxy_url)
114        {
115            builder = builder.proxy(proxy);
116        }
117
118        // SAFETY: TLS via rustls is always available; build() can only fail if
119        // TLS backend is completely absent, which cannot happen with use_rustls_tls().
120        #[allow(clippy::expect_used)]
121        let client = builder.build().expect("TLS backend unavailable");
122
123        Self { client, config }
124    }
125
126    /// Build a realistic set of browser-like default headers.
127    fn default_headers() -> header::HeaderMap {
128        let mut headers = header::HeaderMap::new();
129        headers.insert(
130            header::ACCEPT,
131            header::HeaderValue::from_static(
132                "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
133            ),
134        );
135        headers.insert(
136            header::ACCEPT_LANGUAGE,
137            header::HeaderValue::from_static("en-US,en;q=0.5"),
138        );
139        headers.insert(
140            header::ACCEPT_ENCODING,
141            header::HeaderValue::from_static("gzip, deflate, br"),
142        );
143        headers.insert("DNT", header::HeaderValue::from_static("1"));
144        headers.insert(
145            "Upgrade-Insecure-Requests",
146            header::HeaderValue::from_static("1"),
147        );
148        headers
149    }
150
151    /// Pick the next User-Agent via round-robin.
152    fn next_user_agent(&self) -> &'static str {
153        let idx = self
154            .config
155            .ua_counter
156            .fetch_add(1, std::sync::atomic::Ordering::Relaxed);
157        let len = USER_AGENTS.len();
158        USER_AGENTS.get(idx % len).copied().unwrap_or("")
159    }
160
161    /// Execute a single HTTP GET with the provided URL and return raw content.
162    async fn fetch(&self, url: &str) -> Result<(String, serde_json::Value)> {
163        let ua = if self.config.rotate_user_agent {
164            self.next_user_agent()
165        } else {
166            USER_AGENTS.first().copied().unwrap_or("")
167        };
168
169        let response = self
170            .client
171            .get(url)
172            .header(header::USER_AGENT, ua)
173            .send()
174            .await
175            .map_err(|e| StygianError::Service(ServiceError::Unavailable(e.to_string())))?;
176
177        let status = response.status();
178        let content_type = response
179            .headers()
180            .get(header::CONTENT_TYPE)
181            .and_then(|v| v.to_str().ok())
182            .unwrap_or("text/plain")
183            .to_string();
184
185        if !status.is_success() {
186            return Err(StygianError::Service(ServiceError::Unavailable(format!(
187                "HTTP {status} for {url}"
188            ))));
189        }
190
191        let body = response
192            .text()
193            .await
194            .map_err(|e| StygianError::Service(ServiceError::Unavailable(e.to_string())))?;
195
196        let metadata = serde_json::json!({
197            "status_code": status.as_u16(),
198            "content_type": content_type,
199            "user_agent": ua,
200            "url": url,
201        });
202
203        Ok((body, metadata))
204    }
205
206    /// Check whether a status code is a transient error worth retrying.
207    const fn is_retryable_status(code: u16) -> bool {
208        matches!(code, 429 | 500 | 502 | 503 | 504)
209    }
210}
211
212impl Default for HttpAdapter {
213    fn default() -> Self {
214        Self::new()
215    }
216}
217
218#[async_trait]
219impl ScrapingService for HttpAdapter {
220    async fn execute(&self, input: ServiceInput) -> Result<ServiceOutput> {
221        let mut last_err: Option<StygianError> = None;
222
223        for attempt in 0..=self.config.max_retries {
224            if attempt > 0 {
225                // Exponential backoff: 1s, 2s, 4s, …
226                let delay = self.config.retry_base_delay * 2u32.saturating_pow(attempt - 1);
227                tokio::time::sleep(delay).await;
228            }
229
230            match self.fetch(&input.url).await {
231                Ok((data, metadata)) => {
232                    return Ok(ServiceOutput { data, metadata });
233                }
234                Err(StygianError::Service(ServiceError::Unavailable(ref msg))) => {
235                    // Check if we got a retryable HTTP status embedded in the message
236                    let retryable = msg
237                        .split_whitespace()
238                        .find_map(|w| w.parse::<u16>().ok())
239                        .is_none_or(Self::is_retryable_status);
240
241                    if retryable && attempt < self.config.max_retries {
242                        last_err = Some(StygianError::Service(ServiceError::Unavailable(
243                            msg.clone(),
244                        )));
245                        continue;
246                    }
247                    return Err(StygianError::Service(ServiceError::Unavailable(
248                        msg.clone(),
249                    )));
250                }
251                Err(e) => return Err(e),
252            }
253        }
254
255        Err(last_err.unwrap_or_else(|| {
256            StygianError::Service(ServiceError::Unavailable("Max retries exceeded".into()))
257        }))
258    }
259
260    fn name(&self) -> &'static str {
261        "http"
262    }
263}
264
#[cfg(test)]
mod tests {
    use super::*;

    /// The default configuration retries three times, rotates User-Agents
    /// and uses no proxy.
    #[test]
    fn test_default_config() {
        let HttpConfig {
            max_retries,
            rotate_user_agent,
            proxy_url,
            ..
        } = HttpConfig::default();

        assert_eq!(max_retries, 3);
        assert!(rotate_user_agent);
        assert!(proxy_url.is_none());
    }

    /// Two consecutive picks come from the pool and differ from each other.
    #[test]
    fn test_user_agent_rotation() {
        let adapter = HttpAdapter::new();
        let first = adapter.next_user_agent();
        let second = adapter.next_user_agent();

        for ua in [first, second] {
            assert!(USER_AGENTS.contains(&ua));
        }
        assert_ne!(first, second);
    }

    /// After a full pass over the pool the next pick is still a pool entry.
    #[test]
    fn test_user_agent_wraps_around() {
        let adapter = HttpAdapter::new();
        (0..USER_AGENTS.len()).for_each(|_| {
            adapter.next_user_agent();
        });

        let wrapped = adapter.next_user_agent();
        assert!(USER_AGENTS.contains(&wrapped));
    }

    /// 429 and 503 are retryable; 404 and 200 are not.
    #[test]
    fn test_retryable_status_codes() {
        for code in [429, 503] {
            assert!(HttpAdapter::is_retryable_status(code));
        }
        for code in [404, 200] {
            assert!(!HttpAdapter::is_retryable_status(code));
        }
    }

    /// The adapter identifies itself as "http".
    #[test]
    fn test_adapter_name() {
        let name = HttpAdapter::new().name();
        assert_eq!(name, "http");
    }
}