stygian_graph/application/
health.rs

//! Health check reporting for Kubernetes liveness and readiness probes.
//!
//! Provides structured health-check types and a [`HealthReporter`] for aggregating
//! component-level health into an overall [`HealthReport`].
//!
//! # Example
//!
//! ```
//! use stygian_graph::application::health::{HealthReporter, HealthStatus, ComponentHealth};
//!
//! let reporter = HealthReporter::new();
//! reporter.register("database", HealthStatus::Healthy);
//! reporter.register("cache", HealthStatus::Degraded("High latency".to_string()));
//!
//! let report = reporter.report();
//! assert!(report.is_ready());  // Degraded is still operational ⇒ ready
//! assert!(report.is_live());   // Still alive while degraded
//! ```
19
20use std::collections::HashMap;
21use std::time::SystemTime;
22
23use parking_lot::RwLock;
24
25use serde::{Deserialize, Serialize};
26
27// ─── HealthStatus ─────────────────────────────────────────────────────────────
28
/// The health status of a single component.
///
/// Serialized by serde in adjacently tagged form with lowercase variant
/// names: the variant name is stored under `"status"` and, for variants that
/// carry one, the reason string is stored under `"reason"`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(tag = "status", content = "reason", rename_all = "lowercase")]
pub enum HealthStatus {
    /// Component is operating normally.
    Healthy,
    /// Component is partially impaired but still serving requests.
    ///
    /// The payload is a human-readable reason for the degradation.
    Degraded(String),
    /// Component is non-functional; requests will fail.
    ///
    /// The payload is a human-readable reason for the failure.
    Unhealthy(String),
}
40
41impl HealthStatus {
42    /// Returns `true` only when the component is fully healthy.
43    ///
44    /// # Example
45    ///
46    /// ```
47    /// use stygian_graph::application::health::HealthStatus;
48    /// assert!(HealthStatus::Healthy.is_healthy());
49    /// assert!(!HealthStatus::Degraded("latency".into()).is_healthy());
50    /// ```
51    pub const fn is_healthy(&self) -> bool {
52        matches!(self, Self::Healthy)
53    }
54
55    /// Returns `true` when the component can still serve requests (healthy or degraded).
56    ///
57    /// # Example
58    ///
59    /// ```
60    /// use stygian_graph::application::health::HealthStatus;
61    /// assert!(HealthStatus::Healthy.is_operational());
62    /// assert!(HealthStatus::Degraded("high latency".into()).is_operational());
63    /// assert!(!HealthStatus::Unhealthy("connection refused".into()).is_operational());
64    /// ```
65    pub const fn is_operational(&self) -> bool {
66        !matches!(self, Self::Unhealthy(_))
67    }
68}
69
70impl std::fmt::Display for HealthStatus {
71    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
72        match self {
73            Self::Healthy => write!(f, "healthy"),
74            Self::Degraded(r) => write!(f, "degraded: {r}"),
75            Self::Unhealthy(r) => write!(f, "unhealthy: {r}"),
76        }
77    }
78}
79
80// ─── ComponentHealth ─────────────────────────────────────────────────────────
81
/// Health state for a single named component.
///
/// Returned as part of a [`HealthReport`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ComponentHealth {
    /// Component identifier (e.g. `"database"`, `"cache"`, `"worker_pool"`).
    ///
    /// [`HealthReporter`] keys its registry on this name, so registering a
    /// component under an existing name replaces the earlier entry.
    pub name: String,
    /// Current status of the component.
    pub status: HealthStatus,
    /// Optional free-form details (timings, error messages, etc.).
    ///
    /// Omitted from the serialized output entirely when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub details: Option<serde_json::Value>,
}
95
96impl ComponentHealth {
97    /// Create a healthy component with no extra details.
98    ///
99    /// # Example
100    ///
101    /// ```
102    /// use stygian_graph::application::health::{ComponentHealth, HealthStatus};
103    ///
104    /// let c = ComponentHealth::healthy("cache");
105    /// assert_eq!(c.status, HealthStatus::Healthy);
106    /// ```
107    pub fn healthy(name: impl Into<String>) -> Self {
108        Self {
109            name: name.into(),
110            status: HealthStatus::Healthy,
111            details: None,
112        }
113    }
114
115    /// Create a degraded component.
116    ///
117    /// # Example
118    ///
119    /// ```
120    /// use stygian_graph::application::health::ComponentHealth;
121    ///
122    /// let c = ComponentHealth::degraded("database", "replication lag 5s");
123    /// assert!(!c.status.is_healthy());
124    /// ```
125    pub fn degraded(name: impl Into<String>, reason: impl Into<String>) -> Self {
126        Self {
127            name: name.into(),
128            status: HealthStatus::Degraded(reason.into()),
129            details: None,
130        }
131    }
132
133    /// Create an unhealthy component.
134    ///
135    /// # Example
136    ///
137    /// ```
138    /// use stygian_graph::application::health::ComponentHealth;
139    ///
140    /// let c = ComponentHealth::unhealthy("valkey", "connection refused");
141    /// assert!(!c.status.is_operational());
142    /// ```
143    pub fn unhealthy(name: impl Into<String>, reason: impl Into<String>) -> Self {
144        Self {
145            name: name.into(),
146            status: HealthStatus::Unhealthy(reason.into()),
147            details: None,
148        }
149    }
150
151    /// Attach arbitrary JSON details to this component.
152    ///
153    /// # Example
154    ///
155    /// ```
156    /// use stygian_graph::application::health::ComponentHealth;
157    ///
158    /// let c = ComponentHealth::healthy("http_pool")
159    ///     .with_details(serde_json::json!({ "idle_connections": 8, "max": 32 }));
160    ///
161    /// assert!(c.details.is_some());
162    /// ```
163    #[must_use]
164    pub fn with_details(mut self, details: serde_json::Value) -> Self {
165        self.details = Some(details);
166        self
167    }
168}
169
170// ─── HealthReport ────────────────────────────────────────────────────────────
171
/// Aggregated health report for all registered components.
///
/// Returned by [`HealthReporter::report`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HealthReport {
    /// Overall system status (worst of all components).
    ///
    /// `Healthy` when no component is degraded or unhealthy, which includes
    /// a report with no components at all.
    pub overall: HealthStatus,
    /// Per-component breakdown.
    pub components: Vec<ComponentHealth>,
    /// When this report was generated.
    ///
    /// Serialized as whole seconds since the Unix epoch, so sub-second
    /// precision does not survive a serialize/deserialize round trip.
    #[serde(with = "system_time_serde")]
    pub checked_at: SystemTime,
}
185
186impl HealthReport {
187    /// Returns `true` when the system is ready to serve traffic.
188    ///
189    /// The system is ready only when **all** components are healthy or degraded
190    /// (Kubernetes readiness probe).
191    ///
192    /// # Example
193    ///
194    /// ```
195    /// use stygian_graph::application::health::{HealthReporter, HealthStatus};
196    ///
197    /// let mut r = HealthReporter::new();
198    /// r.register("db", HealthStatus::Healthy);
199    /// assert!(r.report().is_ready());
200    /// ```
201    pub fn is_ready(&self) -> bool {
202        self.components.iter().all(|c| c.status.is_operational())
203    }
204
205    /// Returns `true` while the process should continue running.
206    ///
207    /// The process is considered alive unless every component is unhealthy
208    /// (Kubernetes liveness probe).
209    ///
210    /// # Example
211    ///
212    /// ```
213    /// use stygian_graph::application::health::{HealthReporter, HealthStatus};
214    ///
215    /// let r = HealthReporter::new();
216    /// r.register("db", HealthStatus::Unhealthy("disk full".into()));
217    /// r.register("cache", HealthStatus::Healthy);
218    /// // One unhealthy component doesn't kill the process while others are healthy
219    /// assert!(r.report().is_live());
220    /// ```
221    pub fn is_live(&self) -> bool {
222        // Dead when ALL components are unhealthy (or no components registered)
223        if self.components.is_empty() {
224            return true;
225        }
226        self.components.iter().any(|c| c.status.is_operational())
227    }
228
229    /// HTTP status code suitable for a health-check endpoint.
230    ///
231    /// Returns `200` when ready, `503` when not.
232    ///
233    /// # Example
234    ///
235    /// ```
236    /// use stygian_graph::application::health::{HealthReporter, HealthStatus};
237    ///
238    /// let mut r = HealthReporter::new();
239    /// r.register("db", HealthStatus::Healthy);
240    /// assert_eq!(r.report().http_status_code(), 200u16);
241    /// ```
242    pub fn http_status_code(&self) -> u16 {
243        if self.is_ready() { 200 } else { 503 }
244    }
245}
246
247// ─── System-time serde helper ─────────────────────────────────────────────────
248
249mod system_time_serde {
250    use serde::{Deserialize, Deserializer, Serializer};
251    use std::time::{SystemTime, UNIX_EPOCH};
252
253    pub fn serialize<S: Serializer>(t: &SystemTime, s: S) -> Result<S::Ok, S::Error> {
254        let secs = t.duration_since(UNIX_EPOCH).unwrap_or_default().as_secs();
255        s.serialize_u64(secs)
256    }
257
258    pub fn deserialize<'de, D: Deserializer<'de>>(d: D) -> Result<SystemTime, D::Error> {
259        let secs = u64::deserialize(d)?;
260        Ok(UNIX_EPOCH + std::time::Duration::from_secs(secs))
261    }
262}
263
264// ─── HealthReporter ──────────────────────────────────────────────────────────
265
266/// Collects component-level health checks and produces a [`HealthReport`].
267///
268/// Thread-safe; cheaply cloneable via `Arc` patterns.
269///
270/// # Example
271///
272/// ```
273/// use stygian_graph::application::health::{HealthReporter, HealthStatus, ComponentHealth};
274///
275/// let mut reporter = HealthReporter::new();
276/// reporter.register("database", HealthStatus::Healthy);
277/// reporter.register_component(
278///     ComponentHealth::degraded("cache", "latency p99 > 100ms")
279///         .with_details(serde_json::json!({ "p99_ms": 142 }))
280/// );
281///
282/// let report = reporter.report();
283/// assert_eq!(report.http_status_code(), 200u16); // degraded is operational
284/// ```
285pub struct HealthReporter {
286    components: RwLock<HashMap<String, ComponentHealth>>,
287}
288
289impl HealthReporter {
290    /// Create an empty reporter.
291    ///
292    /// # Example
293    ///
294    /// ```
295    /// use stygian_graph::application::health::HealthReporter;
296    ///
297    /// let r = HealthReporter::new();
298    /// assert!(r.report().components.is_empty());
299    /// ```
300    pub fn new() -> Self {
301        Self {
302            components: RwLock::new(HashMap::new()),
303        }
304    }
305
306    /// Register or update a component's status by name.
307    ///
308    /// # Example
309    ///
310    /// ```
311    /// use stygian_graph::application::health::{HealthReporter, HealthStatus};
312    ///
313    /// let mut r = HealthReporter::new();
314    /// r.register("db", HealthStatus::Healthy);
315    /// assert_eq!(r.report().components.len(), 1);
316    /// ```
317    pub fn register(&self, name: impl Into<String>, status: HealthStatus) {
318        let name = name.into();
319        let component = ComponentHealth {
320            name: name.clone(),
321            status,
322            details: None,
323        };
324        self.components.write().insert(name, component);
325    }
326
327    /// Register or update a component with full [`ComponentHealth`].
328    ///
329    /// # Example
330    ///
331    /// ```
332    /// use stygian_graph::application::health::{HealthReporter, ComponentHealth};
333    ///
334    /// let mut r = HealthReporter::new();
335    /// r.register_component(ComponentHealth::healthy("cache"));
336    /// assert_eq!(r.report().components.len(), 1);
337    /// ```
338    pub fn register_component(&self, component: ComponentHealth) {
339        self.components
340            .write()
341            .insert(component.name.clone(), component);
342    }
343
344    /// Remove a component from reporting.
345    ///
346    /// # Example
347    ///
348    /// ```
349    /// use stygian_graph::application::health::{HealthReporter, HealthStatus};
350    ///
351    /// let mut r = HealthReporter::new();
352    /// r.register("db", HealthStatus::Healthy);
353    /// r.deregister("db");
354    /// assert!(r.report().components.is_empty());
355    /// ```
356    pub fn deregister(&self, name: &str) {
357        self.components.write().remove(name);
358    }
359
360    /// Generate a [`HealthReport`] from current component states.
361    ///
362    /// # Example
363    ///
364    /// ```
365    /// use stygian_graph::application::health::{HealthReporter, HealthStatus};
366    ///
367    /// let r = HealthReporter::new();
368    /// let report = r.report();
369    /// assert_eq!(report.overall, HealthStatus::Healthy);
370    /// assert!(report.is_live());
371    /// ```
372    pub fn report(&self) -> HealthReport {
373        let components: Vec<ComponentHealth> = self.components.read().values().cloned().collect();
374
375        let overall = aggregate_status(&components);
376        HealthReport {
377            overall,
378            components,
379            checked_at: SystemTime::now(),
380        }
381    }
382}
383
384impl Default for HealthReporter {
385    fn default() -> Self {
386        Self::new()
387    }
388}
389
390fn aggregate_status(components: &[ComponentHealth]) -> HealthStatus {
391    let mut worst = HealthStatus::Healthy;
392    for c in components {
393        match &c.status {
394            HealthStatus::Unhealthy(r) => {
395                return HealthStatus::Unhealthy(r.clone());
396            }
397            HealthStatus::Degraded(r) => {
398                if worst == HealthStatus::Healthy {
399                    worst = HealthStatus::Degraded(r.clone());
400                }
401            }
402            HealthStatus::Healthy => {}
403        }
404    }
405    worst
406}
407
408// ─── Tests ────────────────────────────────────────────────────────────────────
409
// Unit tests for the health types. A panic is the intended failure signal in
// tests, so `unwrap`/`expect` are allowed here.
#[cfg(test)]
#[allow(clippy::unwrap_used, clippy::expect_used)]
mod tests {
    use super::*;

    #[test]
    fn healthy_status_is_healthy() {
        assert!(HealthStatus::Healthy.is_healthy());
        assert!(HealthStatus::Healthy.is_operational());
    }

    #[test]
    fn degraded_status_is_not_healthy_but_operational() {
        let s = HealthStatus::Degraded("reason".into());
        assert!(!s.is_healthy());
        assert!(s.is_operational());
    }

    #[test]
    fn unhealthy_status_is_not_operational() {
        let s = HealthStatus::Unhealthy("crashed".into());
        assert!(!s.is_healthy());
        assert!(!s.is_operational());
    }

    #[test]
    fn empty_reporter_overall_is_healthy() {
        let reporter = HealthReporter::new();
        assert_eq!(reporter.report().overall, HealthStatus::Healthy);
    }

    #[test]
    fn empty_report_is_ready_and_live() {
        // With nothing registered there is nothing that could have failed.
        let report = HealthReporter::new().report();
        assert!(report.is_ready());
        assert!(report.is_live());
        assert_eq!(report.http_status_code(), 200);
    }

    #[test]
    fn all_healthy_report_is_ready_and_live() {
        let r = HealthReporter::new();
        r.register("db", HealthStatus::Healthy);
        r.register("cache", HealthStatus::Healthy);
        let report = r.report();
        assert!(report.is_ready());
        assert!(report.is_live());
        assert_eq!(report.http_status_code(), 200);
    }

    // Renamed from `degraded_component_report_not_ready_but_still_live`: the
    // assertions show that a degraded component keeps the report ready.
    #[test]
    fn degraded_component_report_is_ready_and_live() {
        let r = HealthReporter::new();
        r.register("db", HealthStatus::Healthy);
        r.register("cache", HealthStatus::Degraded("high latency".into()));
        let report = r.report();
        // Degraded is operational so is_ready returns true
        assert!(report.is_ready());
        assert!(report.is_live());
    }

    #[test]
    fn unhealthy_component_makes_report_not_ready() {
        let r = HealthReporter::new();
        r.register("db", HealthStatus::Unhealthy("connection refused".into()));
        let report = r.report();
        assert!(!report.is_ready());
        assert_eq!(report.http_status_code(), 503);
    }

    #[test]
    fn all_unhealthy_not_live() {
        let r = HealthReporter::new();
        r.register("a", HealthStatus::Unhealthy("x".into()));
        r.register("b", HealthStatus::Unhealthy("y".into()));
        assert!(!r.report().is_live());
    }

    #[test]
    fn register_same_component_updates_status() {
        let r = HealthReporter::new();
        r.register("db", HealthStatus::Healthy);
        r.register("db", HealthStatus::Unhealthy("disk full".into()));
        let report = r.report();
        assert_eq!(report.components.len(), 1);
        assert!(!report.is_ready());
    }

    #[test]
    fn deregister_removes_component() {
        let r = HealthReporter::new();
        r.register("db", HealthStatus::Healthy);
        r.deregister("db");
        assert!(r.report().components.is_empty());
    }

    #[test]
    fn component_health_builders() {
        assert!(ComponentHealth::healthy("x").status.is_healthy());
        assert!(
            ComponentHealth::degraded("x", "slow")
                .status
                .is_operational()
        );
        assert!(
            !ComponentHealth::unhealthy("x", "down")
                .status
                .is_operational()
        );
    }

    #[test]
    fn component_with_details_serializes() {
        let c = ComponentHealth::healthy("pool").with_details(serde_json::json!({ "idle": 8 }));
        assert!(c.details.is_some());
        let json = serde_json::to_string(&c).unwrap();
        assert!(json.contains("idle"));
    }

    #[test]
    fn health_report_serializes_to_json() {
        let r = HealthReporter::new();
        r.register("db", HealthStatus::Healthy);
        let report = r.report();
        let json = serde_json::to_string(&report).expect("serialize");
        assert!(json.contains("healthy"));
    }

    #[test]
    fn aggregate_status_worst_wins() {
        let components = vec![
            ComponentHealth::healthy("a"),
            ComponentHealth::degraded("b", "slow"),
            ComponentHealth::unhealthy("c", "down"),
        ];
        let status = aggregate_status(&components);
        assert!(matches!(status, HealthStatus::Unhealthy(_)));
    }
}
540}