Skip to main content

stygian_charon/
har.rs

1use std::collections::BTreeMap;
2
3use serde_json::Value;
4use thiserror::Error;
5
6use crate::types::TransactionView;
7
/// Number of bytes in one kibibyte; used to spell out the byte limits below.
const KIB: usize = 1024;

/// Upper bound on the raw HAR text length in bytes (10 MiB).
const MAX_HAR_BYTES: usize = 10 * KIB * KIB;
/// Upper bound on the number of elements accepted in `log.entries`.
const MAX_HAR_ENTRIES: usize = 10_000;
/// Upper bound on the number of response headers accepted for a single entry.
const MAX_HEADERS_PER_ENTRY: usize = 256;
/// Upper bound on a request URL's length in bytes (8 KiB).
const MAX_URL_BYTES: usize = 8 * KIB;
12
13/// Errors returned while parsing HAR data.
14#[derive(Debug, Error)]
15pub enum HarError {
16    /// HAR payload is not valid JSON.
17    #[error("invalid HAR json: {0}")]
18    InvalidJson(#[from] serde_json::Error),
19    /// Expected HAR structure is missing required fields.
20    #[error("invalid HAR structure: {0}")]
21    InvalidStructure(&'static str),
22    /// HAR input exceeded a configured safety limit.
23    #[error("har input exceeds safety limit: {0}")]
24    LimitExceeded(&'static str),
25}
26
27/// Internal parsed HAR representation.
28#[derive(Debug, Clone)]
29pub struct ParsedHar {
30    /// Page title from the HAR pages section when available.
31    pub page_title: Option<String>,
32    /// Parsed request transactions.
33    pub requests: Vec<TransactionViewWithType>,
34}
35
36/// Transaction plus optional resource type.
37#[derive(Debug, Clone)]
38pub struct TransactionViewWithType {
39    /// Transaction used by the classifier.
40    pub transaction: TransactionView,
41    /// Resource type (document/script/xhr/etc.) if present in HAR.
42    pub resource_type: Option<String>,
43}
44
45impl TransactionViewWithType {
46    /// Convenience accessor for URL.
47    #[allow(clippy::missing_const_for_fn)]
48    #[must_use]
49    pub fn url(&self) -> &str {
50        &self.transaction.url
51    }
52
53    /// Convenience accessor for status.
54    #[must_use]
55    pub const fn status(&self) -> u16 {
56        self.transaction.status
57    }
58}
59
60impl From<TransactionViewWithType> for TransactionView {
61    fn from(value: TransactionViewWithType) -> Self {
62        value.transaction
63    }
64}
65
66/// Parse a HAR JSON string into transactions usable by the classifier.
67///
68/// # Errors
69///
70/// Returns [`HarError::InvalidJson`] when `har_json` is not valid JSON,
71/// [`HarError::InvalidStructure`] when required HAR fields are missing, or
72/// [`HarError::LimitExceeded`] when input safety limits are exceeded.
73pub fn parse_har_transactions(har_json: &str) -> Result<ParsedHar, HarError> {
74    if har_json.len() > MAX_HAR_BYTES {
75        return Err(HarError::LimitExceeded("har payload too large"));
76    }
77
78    let root: Value = serde_json::from_str(har_json)?;
79
80    let log = root
81        .get("log")
82        .ok_or(HarError::InvalidStructure("missing log object"))?;
83
84    let page_title = log
85        .get("pages")
86        .and_then(Value::as_array)
87        .and_then(|pages| pages.first())
88        .and_then(|page| page.get("title"))
89        .and_then(Value::as_str)
90        .map(str::to_owned);
91
92    let entries = log
93        .get("entries")
94        .and_then(Value::as_array)
95        .ok_or(HarError::InvalidStructure("missing entries array"))?;
96
97    if entries.len() > MAX_HAR_ENTRIES {
98        return Err(HarError::LimitExceeded("too many HAR entries"));
99    }
100
101    let mut requests: Vec<TransactionViewWithType> = Vec::new();
102
103    for entry in entries {
104        let request = entry
105            .get("request")
106            .ok_or(HarError::InvalidStructure("entry missing request"))?;
107        let response = entry
108            .get("response")
109            .ok_or(HarError::InvalidStructure("entry missing response"))?;
110
111        let url = request
112            .get("url")
113            .and_then(Value::as_str)
114            .map(str::to_owned)
115            .ok_or(HarError::InvalidStructure("entry request missing url"))?;
116
117        if url.len() > MAX_URL_BYTES {
118            return Err(HarError::LimitExceeded("request url too large"));
119        }
120
121        let status = response
122            .get("status")
123            .and_then(Value::as_u64)
124            .and_then(|x| u16::try_from(x).ok())
125            .ok_or(HarError::InvalidStructure("entry response missing status"))?;
126
127        let headers = match response.get("headers").and_then(Value::as_array) {
128            Some(headers) => {
129                if headers.len() > MAX_HEADERS_PER_ENTRY {
130                    return Err(HarError::LimitExceeded("too many response headers"));
131                }
132                extract_headers(headers)
133            }
134            None => BTreeMap::new(),
135        };
136
137        let body_snippet = response
138            .get("content")
139            .and_then(|content| content.get("text"))
140            .and_then(Value::as_str)
141            .map(|text| text.chars().take(2_048).collect::<String>());
142
143        let tx = TransactionView {
144            url,
145            status,
146            response_headers: headers,
147            response_body_snippet: body_snippet,
148        };
149
150        requests.push(TransactionViewWithType {
151            transaction: tx,
152            resource_type: entry
153                .get("_resourceType")
154                .and_then(Value::as_str)
155                .map(str::to_owned),
156        });
157    }
158
159    Ok(ParsedHar {
160        page_title,
161        requests,
162    })
163}
164
165fn extract_headers(headers: &[Value]) -> BTreeMap<String, String> {
166    let mut out = BTreeMap::new();
167    for header in headers {
168        let name = header
169            .get("name")
170            .and_then(Value::as_str)
171            .map(str::to_owned);
172        let value = header
173            .get("value")
174            .and_then(Value::as_str)
175            .map(str::to_owned);
176
177        if let (Some(k), Some(v)) = (name, value) {
178            let _prev = out.insert(k, v);
179        }
180    }
181    out
182}
183
#[cfg(test)]
#[allow(
    clippy::unwrap_used,
    clippy::expect_used,
    clippy::panic,
    clippy::indexing_slicing
)]
mod tests {
    use super::*;

    /// Join `count` copies of `item` with commas, for building JSON arrays
    /// that are one element over a limit.
    fn comma_joined(item: &str, count: usize) -> String {
        vec![item; count].join(",")
    }

    /// A single well-formed entry yields the page title and one request.
    #[test]
    fn parses_minimal_har() {
        let json = r#"{
            "log": {
                "pages": [{"title": "https://example.com"}],
                "entries": [
                    {
                        "_resourceType": "document",
                        "request": {"url": "https://example.com"},
                        "response": {
                            "status": 403,
                            "headers": [{"name": "server", "value": "cloudflare"}],
                            "content": {"text": "Attention Required! | Cloudflare"}
                        }
                    }
                ]
            }
        }"#;

        let parsed = parse_har_transactions(json).expect("parse should succeed");

        assert_eq!(parsed.page_title.as_deref(), Some("https://example.com"));
        assert_eq!(parsed.requests.len(), 1);

        let first = parsed
            .requests
            .first()
            .expect("parsed requests unexpectedly empty");
        assert_eq!(first.status(), 403);
        assert_eq!(first.url(), "https://example.com");
    }

    /// Input one byte over the payload limit is rejected by the size check.
    #[test]
    fn rejects_oversized_har_payload() {
        let oversized = " ".repeat(MAX_HAR_BYTES + 1);

        assert!(matches!(
            parse_har_transactions(&oversized),
            Err(HarError::LimitExceeded("har payload too large"))
        ));
    }

    /// One entry over the entry limit is rejected.
    #[test]
    fn rejects_too_many_entries() {
        let entries = comma_joined(
            r#"{"request":{"url":"https://example.com"},"response":{"status":200}}"#,
            MAX_HAR_ENTRIES + 1,
        );
        let json = format!(r#"{{"log":{{"entries":[{entries}]}}}}"#);

        assert!(matches!(
            parse_har_transactions(&json),
            Err(HarError::LimitExceeded("too many HAR entries"))
        ));
    }

    /// One response header over the per-entry limit is rejected.
    #[test]
    fn rejects_too_many_response_headers() {
        let headers = comma_joined(
            r#"{"name":"server","value":"cloudflare"}"#,
            MAX_HEADERS_PER_ENTRY + 1,
        );
        let json = format!(
            r#"{{"log":{{"entries":[{{"request":{{"url":"https://example.com"}},"response":{{"status":403,"headers":[{headers}]}}}}]}}}}"#
        );

        assert!(matches!(
            parse_har_transactions(&json),
            Err(HarError::LimitExceeded("too many response headers"))
        ));
    }
}