Skip to main content

stygian_charon/
har.rs

1use std::collections::BTreeMap;
2
3use serde_json::Value;
4use thiserror::Error;
5
6use crate::types::TransactionView;
7
/// Upper bound on the byte length of the raw HAR string (10 MiB); checked
/// before the input is handed to the JSON parser.
const MAX_HAR_BYTES: usize = 10_485_760;
/// Upper bound on the number of elements accepted in `log.entries`.
const MAX_HAR_ENTRIES: usize = 10_000;
/// Upper bound on the number of elements accepted in one entry's
/// `response.headers` array.
const MAX_HEADERS_PER_ENTRY: usize = 256;
/// Upper bound on the byte length of one entry's request URL (8 KiB).
const MAX_URL_BYTES: usize = 8_192;
12
13/// Errors returned while parsing HAR data.
14#[derive(Debug, Error)]
15pub enum HarError {
16    /// HAR payload is not valid JSON.
17    #[error("invalid HAR json: {0}")]
18    InvalidJson(#[from] serde_json::Error),
19    /// Expected HAR structure is missing required fields.
20    #[error("invalid HAR structure: {0}")]
21    InvalidStructure(&'static str),
22    /// HAR input exceeded a configured safety limit.
23    #[error("har input exceeds safety limit: {0}")]
24    LimitExceeded(&'static str),
25}
26
27/// Internal parsed HAR representation.
28#[derive(Debug, Clone)]
29pub struct ParsedHar {
30    /// Page title from the HAR pages section when available.
31    pub page_title: Option<String>,
32    /// Parsed request transactions.
33    pub requests: Vec<TransactionViewWithType>,
34}
35
36/// Transaction plus optional resource type.
37#[derive(Debug, Clone)]
38pub struct TransactionViewWithType {
39    /// Transaction used by the classifier.
40    pub transaction: TransactionView,
41    /// Resource type (document/script/xhr/etc.) if present in HAR.
42    pub resource_type: Option<String>,
43}
44
45impl TransactionViewWithType {
46    /// Convenience accessor for URL.
47    #[allow(clippy::missing_const_for_fn)]
48    #[must_use]
49    pub fn url(&self) -> &str {
50        &self.transaction.url
51    }
52
53    /// Convenience accessor for status.
54    #[must_use]
55    pub const fn status(&self) -> u16 {
56        self.transaction.status
57    }
58}
59
60impl From<TransactionViewWithType> for TransactionView {
61    fn from(value: TransactionViewWithType) -> Self {
62        value.transaction
63    }
64}
65
66/// Parse a HAR JSON string into transactions usable by the classifier.
67///
68/// # Errors
69///
70/// Returns [`HarError::InvalidJson`] when `har_json` is not valid JSON,
71/// [`HarError::InvalidStructure`] when required HAR fields are missing, or
72/// [`HarError::LimitExceeded`] when input safety limits are exceeded.
73pub fn parse_har_transactions(har_json: &str) -> Result<ParsedHar, HarError> {
74    if har_json.len() > MAX_HAR_BYTES {
75        return Err(HarError::LimitExceeded("har payload too large"));
76    }
77
78    let root: Value = serde_json::from_str(har_json)?;
79
80    let log = root
81        .get("log")
82        .ok_or(HarError::InvalidStructure("missing log object"))?;
83
84    let page_title = log
85        .get("pages")
86        .and_then(Value::as_array)
87        .and_then(|pages| pages.first())
88        .and_then(|page| page.get("title"))
89        .and_then(Value::as_str)
90        .map(str::to_owned);
91
92    let entries = log
93        .get("entries")
94        .and_then(Value::as_array)
95        .ok_or(HarError::InvalidStructure("missing entries array"))?;
96
97    if entries.len() > MAX_HAR_ENTRIES {
98        return Err(HarError::LimitExceeded("too many HAR entries"));
99    }
100
101    let mut requests: Vec<TransactionViewWithType> = Vec::new();
102
103    for entry in entries {
104        let request = entry
105            .get("request")
106            .ok_or(HarError::InvalidStructure("entry missing request"))?;
107        let response = entry
108            .get("response")
109            .ok_or(HarError::InvalidStructure("entry missing response"))?;
110
111        let url = request
112            .get("url")
113            .and_then(Value::as_str)
114            .map(str::to_owned)
115            .ok_or(HarError::InvalidStructure("entry request missing url"))?;
116
117        if url.len() > MAX_URL_BYTES {
118            return Err(HarError::LimitExceeded("request url too large"));
119        }
120
121        let status = response
122            .get("status")
123            .and_then(Value::as_u64)
124            .and_then(|x| u16::try_from(x).ok())
125            .ok_or(HarError::InvalidStructure("entry response missing status"))?;
126
127        let headers = match response.get("headers").and_then(Value::as_array) {
128            Some(headers) => {
129                if headers.len() > MAX_HEADERS_PER_ENTRY {
130                    return Err(HarError::LimitExceeded("too many response headers"));
131                }
132                extract_headers(headers)
133            }
134            None => BTreeMap::new(),
135        };
136
137        let body_snippet = response
138            .get("content")
139            .and_then(|content| content.get("text"))
140            .and_then(Value::as_str)
141            .map(|text| text.chars().take(2_048).collect::<String>());
142
143        let tx = TransactionView {
144            url,
145            status,
146            response_headers: headers,
147            response_body_snippet: body_snippet,
148        };
149
150        requests.push(TransactionViewWithType {
151            transaction: tx,
152            resource_type: entry
153                .get("_resourceType")
154                .and_then(Value::as_str)
155                .map(str::to_owned),
156        });
157    }
158
159    Ok(ParsedHar {
160        page_title,
161        requests,
162    })
163}
164
165fn extract_headers(headers: &[Value]) -> BTreeMap<String, String> {
166    let mut out = BTreeMap::new();
167    for header in headers {
168        let name = header
169            .get("name")
170            .and_then(Value::as_str)
171            .map(str::to_owned);
172        let value = header
173            .get("value")
174            .and_then(Value::as_str)
175            .map(str::to_owned);
176
177        if let (Some(k), Some(v)) = (name, value) {
178            let _prev = out.insert(k, v);
179        }
180    }
181    out
182}
183
#[cfg(test)]
mod tests {
    use super::*;

    /// Happy path: one page, one entry, every optional field present.
    #[test]
    fn parses_minimal_har() {
        let json = r#"{
            "log": {
                "pages": [{"title": "https://example.com"}],
                "entries": [
                    {
                        "_resourceType": "document",
                        "request": {"url": "https://example.com"},
                        "response": {
                            "status": 403,
                            "headers": [{"name": "server", "value": "cloudflare"}],
                            "content": {"text": "Attention Required! | Cloudflare"}
                        }
                    }
                ]
            }
        }"#;

        let parsed_result = parse_har_transactions(json);
        assert!(parsed_result.is_ok(), "parse should succeed");

        let Ok(parsed) = parsed_result else {
            return;
        };

        assert_eq!(parsed.page_title.as_deref(), Some("https://example.com"));
        assert_eq!(parsed.requests.len(), 1);

        let first = parsed.requests.first();
        assert!(first.is_some(), "parsed requests unexpectedly empty");
        if let Some(first) = first {
            assert_eq!(first.status(), 403);
            assert_eq!(first.url(), "https://example.com");
            assert_eq!(first.resource_type.as_deref(), Some("document"));
            assert_eq!(
                first
                    .transaction
                    .response_headers
                    .get("server")
                    .map(String::as_str),
                Some("cloudflare")
            );
            assert_eq!(
                first.transaction.response_body_snippet.as_deref(),
                Some("Attention Required! | Cloudflare")
            );
        }
    }

    /// Input that is not JSON at all surfaces the `serde_json` error.
    #[test]
    fn rejects_invalid_json() {
        let result = parse_har_transactions("not json");

        assert!(matches!(result, Err(HarError::InvalidJson(_))));
    }

    /// Valid JSON without the required `log` / `entries` skeleton is rejected.
    #[test]
    fn rejects_missing_structure() {
        assert!(matches!(
            parse_har_transactions("{}"),
            Err(HarError::InvalidStructure("missing log object"))
        ));
        assert!(matches!(
            parse_har_transactions(r#"{"log":{}}"#),
            Err(HarError::InvalidStructure("missing entries array"))
        ));
    }

    /// The byte-size guard fires before any JSON parsing is attempted.
    #[test]
    fn rejects_oversized_har_payload() {
        let oversized = " ".repeat(MAX_HAR_BYTES + 1);

        let result = parse_har_transactions(&oversized);

        assert!(matches!(
            result,
            Err(HarError::LimitExceeded("har payload too large"))
        ));
    }

    /// One entry more than `MAX_HAR_ENTRIES` is rejected.
    #[test]
    fn rejects_too_many_entries() {
        let entries = std::iter::repeat_n(
            r#"{"request":{"url":"https://example.com"},"response":{"status":200}}"#,
            MAX_HAR_ENTRIES + 1,
        )
        .collect::<Vec<_>>()
        .join(",");
        let json = format!(r#"{{"log":{{"entries":[{entries}]}}}}"#);

        let result = parse_har_transactions(&json);

        assert!(matches!(
            result,
            Err(HarError::LimitExceeded("too many HAR entries"))
        ));
    }

    /// One header more than `MAX_HEADERS_PER_ENTRY` is rejected.
    #[test]
    fn rejects_too_many_response_headers() {
        let headers = std::iter::repeat_n(
            r#"{"name":"server","value":"cloudflare"}"#,
            MAX_HEADERS_PER_ENTRY + 1,
        )
        .collect::<Vec<_>>()
        .join(",");
        let json = format!(
            r#"{{"log":{{"entries":[{{"request":{{"url":"https://example.com"}},"response":{{"status":403,"headers":[{headers}]}}}}]}}}}"#
        );

        let result = parse_har_transactions(&json);

        assert!(matches!(
            result,
            Err(HarError::LimitExceeded("too many response headers"))
        ));
    }

    /// A request URL longer than `MAX_URL_BYTES` is rejected.
    #[test]
    fn rejects_oversized_url() {
        let url = format!("https://example.com/{}", "a".repeat(MAX_URL_BYTES));
        let json = format!(
            r#"{{"log":{{"entries":[{{"request":{{"url":"{url}"}},"response":{{"status":200}}}}]}}}}"#
        );

        let result = parse_har_transactions(&json);

        assert!(matches!(
            result,
            Err(HarError::LimitExceeded("request url too large"))
        ));
    }

    /// Response body text is cut to 2,048 characters.
    #[test]
    fn truncates_body_snippet() {
        let body = "x".repeat(3_000);
        let json = format!(
            r#"{{"log":{{"entries":[{{"request":{{"url":"https://example.com"}},"response":{{"status":200,"content":{{"text":"{body}"}}}}}}]}}}}"#
        );

        let parsed_result = parse_har_transactions(&json);
        assert!(parsed_result.is_ok(), "parse should succeed");

        let Ok(parsed) = parsed_result else {
            return;
        };

        let snippet_chars = parsed
            .requests
            .first()
            .and_then(|request| request.transaction.response_body_snippet.as_deref())
            .map(|snippet| snippet.chars().count());
        assert_eq!(snippet_chars, Some(2_048));
    }
}