stygian_graph/adapters/
http.rs1use std::time::Duration;
28
29use async_trait::async_trait;
30use reqwest::{Client, Proxy, header};
31
32use crate::domain::error::{Result, ServiceError, StygianError};
33use crate::ports::{ScrapingService, ServiceInput, ServiceOutput};
34
/// Pool of desktop browser `User-Agent` strings sent with outgoing requests.
///
/// The entries cover Chrome (Windows, macOS, Linux), Firefox (Windows, macOS)
/// and Safari (macOS). The adapter walks this list round-robin when rotation
/// is enabled and otherwise always uses the first entry.
static USER_AGENTS: &[&str] = &[
    // Chrome 131
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    // Firefox 133
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 14.7; rv:133.0) Gecko/20100101 Firefox/133.0",
    // Safari 18.1
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_7_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.1 Safari/605.1.15",
];
44
/// Tunable settings for [`HttpAdapter`].
///
/// Cloning a config is cheap, and every clone shares the same user-agent
/// rotation counter because that counter lives behind an `Arc`.
#[derive(Debug, Clone)]
pub struct HttpConfig {
    /// Timeout handed to the underlying `reqwest` client builder.
    pub timeout: Duration,
    /// Number of additional attempts made after the first request fails with
    /// a retryable error (so up to `max_retries + 1` requests in total).
    pub max_retries: u32,
    /// Delay before the first retry; it doubles for each subsequent retry.
    pub retry_base_delay: Duration,
    /// Optional proxy URL applied to all traffic when it parses successfully.
    pub proxy_url: Option<String>,
    /// When `true`, each request takes the next entry of the user-agent pool;
    /// when `false`, the first entry is always used.
    pub rotate_user_agent: bool,
    /// Monotonic counter used to pick the next user agent (taken modulo the
    /// pool size). Shared between clones of this config.
    pub(crate) ua_counter: std::sync::Arc<std::sync::atomic::AtomicUsize>,
}
61
62impl Default for HttpConfig {
63 fn default() -> Self {
64 Self {
65 timeout: Duration::from_secs(30),
66 max_retries: 3,
67 retry_base_delay: Duration::from_secs(1),
68 proxy_url: None,
69 rotate_user_agent: true,
70 ua_counter: std::sync::Arc::new(std::sync::atomic::AtomicUsize::new(0)),
71 }
72 }
73}
74
75#[derive(Clone)]
80pub struct HttpAdapter {
81 client: Client,
82 config: HttpConfig,
83}
84
85impl HttpAdapter {
86 pub fn new() -> Self {
95 Self::with_config(HttpConfig::default())
96 }
97
98 pub fn with_config(config: HttpConfig) -> Self {
104 let mut builder = Client::builder()
105 .timeout(config.timeout)
106 .cookie_store(true)
107 .gzip(true)
108 .brotli(true)
109 .use_rustls_tls()
110 .default_headers(Self::default_headers());
111
112 if let Some(ref proxy_url) = config.proxy_url
113 && let Ok(proxy) = Proxy::all(proxy_url)
114 {
115 builder = builder.proxy(proxy);
116 }
117
118 #[allow(clippy::expect_used)]
121 let client = builder.build().expect("TLS backend unavailable");
122
123 Self { client, config }
124 }
125
126 fn default_headers() -> header::HeaderMap {
128 let mut headers = header::HeaderMap::new();
129 headers.insert(
130 header::ACCEPT,
131 header::HeaderValue::from_static(
132 "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
133 ),
134 );
135 headers.insert(
136 header::ACCEPT_LANGUAGE,
137 header::HeaderValue::from_static("en-US,en;q=0.5"),
138 );
139 headers.insert(
140 header::ACCEPT_ENCODING,
141 header::HeaderValue::from_static("gzip, deflate, br"),
142 );
143 headers.insert("DNT", header::HeaderValue::from_static("1"));
144 headers.insert(
145 "Upgrade-Insecure-Requests",
146 header::HeaderValue::from_static("1"),
147 );
148 headers
149 }
150
151 fn next_user_agent(&self) -> &'static str {
153 let idx = self
154 .config
155 .ua_counter
156 .fetch_add(1, std::sync::atomic::Ordering::Relaxed);
157 let len = USER_AGENTS.len();
158 USER_AGENTS.get(idx % len).copied().unwrap_or("")
159 }
160
161 async fn fetch(&self, url: &str) -> Result<(String, serde_json::Value)> {
163 let ua = if self.config.rotate_user_agent {
164 self.next_user_agent()
165 } else {
166 USER_AGENTS.first().copied().unwrap_or("")
167 };
168
169 let response = self
170 .client
171 .get(url)
172 .header(header::USER_AGENT, ua)
173 .send()
174 .await
175 .map_err(|e| StygianError::Service(ServiceError::Unavailable(e.to_string())))?;
176
177 let status = response.status();
178 let content_type = response
179 .headers()
180 .get(header::CONTENT_TYPE)
181 .and_then(|v| v.to_str().ok())
182 .unwrap_or("text/plain")
183 .to_string();
184
185 if !status.is_success() {
186 return Err(StygianError::Service(ServiceError::Unavailable(format!(
187 "HTTP {status} for {url}"
188 ))));
189 }
190
191 let body = response
192 .text()
193 .await
194 .map_err(|e| StygianError::Service(ServiceError::Unavailable(e.to_string())))?;
195
196 let metadata = serde_json::json!({
197 "status_code": status.as_u16(),
198 "content_type": content_type,
199 "user_agent": ua,
200 "url": url,
201 });
202
203 Ok((body, metadata))
204 }
205
206 const fn is_retryable_status(code: u16) -> bool {
208 matches!(code, 429 | 500 | 502 | 503 | 504)
209 }
210}
211
212impl Default for HttpAdapter {
213 fn default() -> Self {
214 Self::new()
215 }
216}
217
218#[async_trait]
219impl ScrapingService for HttpAdapter {
220 async fn execute(&self, input: ServiceInput) -> Result<ServiceOutput> {
221 let mut last_err: Option<StygianError> = None;
222
223 for attempt in 0..=self.config.max_retries {
224 if attempt > 0 {
225 let delay = self.config.retry_base_delay * 2u32.saturating_pow(attempt - 1);
227 tokio::time::sleep(delay).await;
228 }
229
230 match self.fetch(&input.url).await {
231 Ok((data, metadata)) => {
232 return Ok(ServiceOutput { data, metadata });
233 }
234 Err(StygianError::Service(ServiceError::Unavailable(ref msg))) => {
235 let retryable = msg
237 .split_whitespace()
238 .find_map(|w| w.parse::<u16>().ok())
239 .is_none_or(Self::is_retryable_status);
240
241 if retryable && attempt < self.config.max_retries {
242 last_err = Some(StygianError::Service(ServiceError::Unavailable(
243 msg.clone(),
244 )));
245 continue;
246 }
247 return Err(StygianError::Service(ServiceError::Unavailable(
248 msg.clone(),
249 )));
250 }
251 Err(e) => return Err(e),
252 }
253 }
254
255 Err(last_err.unwrap_or_else(|| {
256 StygianError::Service(ServiceError::Unavailable("Max retries exceeded".into()))
257 }))
258 }
259
260 fn name(&self) -> &'static str {
261 "http"
262 }
263}
264
#[cfg(test)]
mod tests {
    use super::*;

    /// The default configuration retries three times, rotates user agents
    /// and uses no proxy.
    #[test]
    fn test_default_config() {
        let HttpConfig {
            max_retries,
            rotate_user_agent,
            proxy_url,
            ..
        } = HttpConfig::default();

        assert_eq!(max_retries, 3);
        assert!(rotate_user_agent);
        assert!(proxy_url.is_none());
    }

    /// Two consecutive picks come from the pool and differ from each other.
    #[test]
    fn test_user_agent_rotation() {
        let adapter = HttpAdapter::new();
        let picks = [adapter.next_user_agent(), adapter.next_user_agent()];

        for ua in &picks {
            assert!(USER_AGENTS.contains(ua));
        }
        assert_ne!(picks[0], picks[1]);
    }

    /// After exhausting the pool once, the next pick is still a pool entry.
    #[test]
    fn test_user_agent_wraps_around() {
        let adapter = HttpAdapter::new();
        (0..USER_AGENTS.len()).for_each(|_| {
            adapter.next_user_agent();
        });

        let ua = adapter.next_user_agent();
        assert!(USER_AGENTS.contains(&ua));
    }

    /// Throttling and server-side failures are retryable; 404 and 200 are not.
    #[test]
    fn test_retryable_status_codes() {
        for code in [429, 503] {
            assert!(HttpAdapter::is_retryable_status(code));
        }
        for code in [404, 200] {
            assert!(!HttpAdapter::is_retryable_status(code));
        }
    }

    /// The adapter identifies itself as `"http"`.
    #[test]
    fn test_adapter_name() {
        let adapter = HttpAdapter::new();
        assert_eq!(adapter.name(), "http");
    }
}