stygian_graph/adapters/
http.rs1use std::time::Duration;
28
29use async_trait::async_trait;
30use reqwest::{Client, Proxy, header};
31
32use crate::domain::error::{Result, ServiceError, StygianError};
33use crate::ports::{ScrapingService, ServiceInput, ServiceOutput};
34
/// Pool of desktop browser `User-Agent` strings sent with outgoing requests.
///
/// `HttpAdapter::next_user_agent` walks this list in order, and
/// `HttpAdapter::fetch` falls back to the first entry when rotation is off.
static USER_AGENTS: &[&str] = &[
    // Chrome 131 — Windows, macOS, Linux.
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    // Firefox 133 — Windows, macOS.
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 14.7; rv:133.0) Gecko/20100101 Firefox/133.0",
    // Safari 18.1 — macOS.
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_7_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.1 Safari/605.1.15",
];
44
/// Settings consumed by `HttpAdapter` when building its client and when
/// retrying failed requests.
#[derive(Debug, Clone)]
pub struct HttpConfig {
    /// Timeout handed to the `reqwest` client builder.
    pub timeout: Duration,
    /// Number of retries allowed after the first attempt, so a request is
    /// tried at most `max_retries + 1` times.
    pub max_retries: u32,
    /// Wait before the first retry; the wait doubles for each further retry.
    pub retry_base_delay: Duration,
    /// Optional proxy URL; when present it is passed to `Proxy::all`.
    pub proxy_url: Option<String>,
    /// When `true`, every request takes the next entry of `USER_AGENTS`;
    /// when `false`, the first entry is always used.
    pub rotate_user_agent: bool,
    /// Position in the user-agent rotation. It lives behind an `Arc`, so
    /// clones of this config advance the same counter.
    pub(crate) ua_counter: std::sync::Arc<std::sync::atomic::AtomicUsize>,
}
61
62impl Default for HttpConfig {
63 fn default() -> Self {
64 Self {
65 timeout: Duration::from_secs(30),
66 max_retries: 3,
67 retry_base_delay: Duration::from_secs(1),
68 proxy_url: None,
69 rotate_user_agent: true,
70 ua_counter: std::sync::Arc::new(std::sync::atomic::AtomicUsize::new(0)),
71 }
72 }
73}
74
75#[derive(Clone)]
80pub struct HttpAdapter {
81 client: Client,
82 config: HttpConfig,
83}
84
85impl HttpAdapter {
86 #[must_use]
95 pub fn new() -> Self {
96 Self::with_config(HttpConfig::default())
97 }
98
99 #[must_use]
105 pub fn with_config(config: HttpConfig) -> Self {
106 let mut builder = Client::builder()
107 .timeout(config.timeout)
108 .cookie_store(true)
109 .gzip(true)
110 .brotli(true)
111 .use_rustls_tls()
112 .default_headers(Self::default_headers());
113
114 if let Some(ref proxy_url) = config.proxy_url
115 && let Ok(proxy) = Proxy::all(proxy_url)
116 {
117 builder = builder.proxy(proxy);
118 }
119
120 #[allow(clippy::expect_used)]
123 let client = builder.build().expect("TLS backend unavailable");
124
125 Self { client, config }
126 }
127
128 fn default_headers() -> header::HeaderMap {
130 let mut headers = header::HeaderMap::new();
131 headers.insert(
132 header::ACCEPT,
133 header::HeaderValue::from_static(
134 "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
135 ),
136 );
137 headers.insert(
138 header::ACCEPT_LANGUAGE,
139 header::HeaderValue::from_static("en-US,en;q=0.5"),
140 );
141 headers.insert(
142 header::ACCEPT_ENCODING,
143 header::HeaderValue::from_static("gzip, deflate, br"),
144 );
145 headers.insert("DNT", header::HeaderValue::from_static("1"));
146 headers.insert(
147 "Upgrade-Insecure-Requests",
148 header::HeaderValue::from_static("1"),
149 );
150 headers
151 }
152
153 fn next_user_agent(&self) -> &'static str {
155 let idx = self
156 .config
157 .ua_counter
158 .fetch_add(1, std::sync::atomic::Ordering::Relaxed);
159 let len = USER_AGENTS.len();
160 USER_AGENTS.get(idx % len).copied().unwrap_or("")
161 }
162
163 async fn fetch(&self, url: &str) -> Result<(String, serde_json::Value)> {
165 let ua = if self.config.rotate_user_agent {
166 self.next_user_agent()
167 } else {
168 USER_AGENTS.first().copied().unwrap_or("")
169 };
170
171 let response = self
172 .client
173 .get(url)
174 .header(header::USER_AGENT, ua)
175 .send()
176 .await
177 .map_err(|e| StygianError::Service(ServiceError::Unavailable(e.to_string())))?;
178
179 let status = response.status();
180 let content_type = response
181 .headers()
182 .get(header::CONTENT_TYPE)
183 .and_then(|v| v.to_str().ok())
184 .unwrap_or("text/plain")
185 .to_string();
186
187 if !status.is_success() {
188 return Err(StygianError::Service(ServiceError::Unavailable(format!(
189 "HTTP {status} for {url}"
190 ))));
191 }
192
193 let body = response
194 .text()
195 .await
196 .map_err(|e| StygianError::Service(ServiceError::Unavailable(e.to_string())))?;
197
198 let metadata = serde_json::json!({
199 "status_code": status.as_u16(),
200 "content_type": content_type,
201 "user_agent": ua,
202 "url": url,
203 });
204
205 Ok((body, metadata))
206 }
207
208 const fn is_retryable_status(code: u16) -> bool {
210 matches!(code, 429 | 500 | 502 | 503 | 504)
211 }
212}
213
214impl Default for HttpAdapter {
215 fn default() -> Self {
216 Self::new()
217 }
218}
219
220#[async_trait]
221impl ScrapingService for HttpAdapter {
222 async fn execute(&self, input: ServiceInput) -> Result<ServiceOutput> {
223 let mut last_err: Option<StygianError> = None;
224
225 for attempt in 0..=self.config.max_retries {
226 if attempt > 0 {
227 let delay = self.config.retry_base_delay * 2u32.saturating_pow(attempt - 1);
229 tokio::time::sleep(delay).await;
230 }
231
232 match self.fetch(&input.url).await {
233 Ok((data, metadata)) => {
234 return Ok(ServiceOutput { data, metadata });
235 }
236 Err(StygianError::Service(ServiceError::Unavailable(ref msg))) => {
237 let retryable = msg
239 .split_whitespace()
240 .find_map(|w| w.parse::<u16>().ok())
241 .is_none_or(Self::is_retryable_status);
242
243 if retryable && attempt < self.config.max_retries {
244 last_err = Some(StygianError::Service(ServiceError::Unavailable(
245 msg.clone(),
246 )));
247 continue;
248 }
249 return Err(StygianError::Service(ServiceError::Unavailable(
250 msg.clone(),
251 )));
252 }
253 Err(e) => return Err(e),
254 }
255 }
256
257 Err(last_err.unwrap_or_else(|| {
258 StygianError::Service(ServiceError::Unavailable("Max retries exceeded".into()))
259 }))
260 }
261
262 fn name(&self) -> &'static str {
263 "http"
264 }
265}
266
#[cfg(test)]
mod tests {
    use super::*;

    /// The default configuration carries the documented stock values.
    #[test]
    fn test_default_config() {
        let config = HttpConfig::default();
        assert_eq!(config.timeout, Duration::from_secs(30));
        assert_eq!(config.max_retries, 3);
        assert_eq!(config.retry_base_delay, Duration::from_secs(1));
        assert!(config.rotate_user_agent);
        assert!(config.proxy_url.is_none());
    }

    /// A fresh adapter starts at the head of the list and moves forward one
    /// entry per call.
    #[test]
    fn test_user_agent_rotation() {
        let adapter = HttpAdapter::new();
        let ua1 = adapter.next_user_agent();
        let ua2 = adapter.next_user_agent();
        assert_eq!(Some(&ua1), USER_AGENTS.first());
        assert_eq!(Some(&ua2), USER_AGENTS.get(1));
        assert_ne!(ua1, ua2);
    }

    /// After one full pass over the list the rotation returns to the first
    /// entry rather than merely yielding "some" valid entry.
    #[test]
    fn test_user_agent_wraps_around() {
        let adapter = HttpAdapter::new();
        for expected in USER_AGENTS {
            assert_eq!(adapter.next_user_agent(), *expected);
        }
        let ua = adapter.next_user_agent();
        assert_eq!(Some(&ua), USER_AGENTS.first());
    }

    /// Exactly 429 and the listed 5xx codes are retryable.
    #[test]
    fn test_retryable_status_codes() {
        for code in [429, 500, 502, 503, 504] {
            assert!(HttpAdapter::is_retryable_status(code), "{code} should retry");
        }
        for code in [200, 301, 400, 401, 403, 404, 501, 505] {
            assert!(
                !HttpAdapter::is_retryable_status(code),
                "{code} should not retry"
            );
        }
    }

    /// The service reports its fixed identifier.
    #[test]
    fn test_adapter_name() {
        let adapter = HttpAdapter::new();
        assert_eq!(adapter.name(), "http");
    }
}