Skip to main content

stygian_graph/application/
health.rs

1//! Health check reporting for Kubernetes liveness and readiness probes.
2//!
3//! Provides structured health-check types and a [`HealthReporter`](health::HealthReporter) for aggregating
4//! component-level health into an overall [`HealthReport`](health::HealthReport).
5//!
6//! # Example
7//!
8//! ```
9//! use stygian_graph::application::health::{HealthReporter, HealthStatus, ComponentHealth};
10//!
11//! let mut reporter = HealthReporter::new();
12//! reporter.register("database", HealthStatus::Healthy);
13//! reporter.register("cache", HealthStatus::Degraded("High latency".to_string()));
14//!
15//! let report = reporter.report();
16//! assert!(report.is_ready());  // Degraded is still operational ⇒ ready
17//! assert!(report.is_live());   // Still alive while degraded
18//! ```
19
20use std::collections::HashMap;
21use std::time::SystemTime;
22
23use parking_lot::RwLock;
24
25use serde::{Deserialize, Serialize};
26
27// ─── HealthStatus ─────────────────────────────────────────────────────────────
28
29/// The health status of a single component.
30#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
31#[serde(tag = "status", content = "reason", rename_all = "lowercase")]
32pub enum HealthStatus {
33    /// Component is operating normally.
34    Healthy,
35    /// Component is partially impaired but still serving requests.
36    Degraded(String),
37    /// Component is non-functional; requests will fail.
38    Unhealthy(String),
39}
40
41impl HealthStatus {
42    /// Returns `true` only when the component is fully healthy.
43    ///
44    /// # Example
45    ///
46    /// ```
47    /// use stygian_graph::application::health::HealthStatus;
48    /// assert!(HealthStatus::Healthy.is_healthy());
49    /// assert!(!HealthStatus::Degraded("latency".into()).is_healthy());
50    /// ```
51    #[must_use]
52    pub const fn is_healthy(&self) -> bool {
53        matches!(self, Self::Healthy)
54    }
55
56    /// Returns `true` when the component can still serve requests (healthy or degraded).
57    ///
58    /// # Example
59    ///
60    /// ```
61    /// use stygian_graph::application::health::HealthStatus;
62    /// assert!(HealthStatus::Healthy.is_operational());
63    /// assert!(HealthStatus::Degraded("high latency".into()).is_operational());
64    /// assert!(!HealthStatus::Unhealthy("connection refused".into()).is_operational());
65    /// ```
66    #[must_use]
67    pub const fn is_operational(&self) -> bool {
68        !matches!(self, Self::Unhealthy(_))
69    }
70}
71
72impl std::fmt::Display for HealthStatus {
73    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
74        match self {
75            Self::Healthy => write!(f, "healthy"),
76            Self::Degraded(r) => write!(f, "degraded: {r}"),
77            Self::Unhealthy(r) => write!(f, "unhealthy: {r}"),
78        }
79    }
80}
81
82// ─── ComponentHealth ─────────────────────────────────────────────────────────
83
84/// Health state for a single named component.
85///
86/// Returned as part of a [`HealthReport`].
87#[derive(Debug, Clone, Serialize, Deserialize)]
88pub struct ComponentHealth {
89    /// Component identifier (e.g. `"database"`, `"cache"`, `"worker_pool"`)
90    pub name: String,
91    /// Component status
92    pub status: HealthStatus,
93    /// Optional free-form details (timings, error messages, etc.)
94    #[serde(skip_serializing_if = "Option::is_none")]
95    pub details: Option<serde_json::Value>,
96}
97
98impl ComponentHealth {
99    /// Create a healthy component with no extra details.
100    ///
101    /// # Example
102    ///
103    /// ```
104    /// use stygian_graph::application::health::{ComponentHealth, HealthStatus};
105    ///
106    /// let c = ComponentHealth::healthy("cache");
107    /// assert_eq!(c.status, HealthStatus::Healthy);
108    /// ```
109    pub fn healthy(name: impl Into<String>) -> Self {
110        Self {
111            name: name.into(),
112            status: HealthStatus::Healthy,
113            details: None,
114        }
115    }
116
117    /// Create a degraded component.
118    ///
119    /// # Example
120    ///
121    /// ```
122    /// use stygian_graph::application::health::ComponentHealth;
123    ///
124    /// let c = ComponentHealth::degraded("database", "replication lag 5s");
125    /// assert!(!c.status.is_healthy());
126    /// ```
127    pub fn degraded(name: impl Into<String>, reason: impl Into<String>) -> Self {
128        Self {
129            name: name.into(),
130            status: HealthStatus::Degraded(reason.into()),
131            details: None,
132        }
133    }
134
135    /// Create an unhealthy component.
136    ///
137    /// # Example
138    ///
139    /// ```
140    /// use stygian_graph::application::health::ComponentHealth;
141    ///
142    /// let c = ComponentHealth::unhealthy("valkey", "connection refused");
143    /// assert!(!c.status.is_operational());
144    /// ```
145    pub fn unhealthy(name: impl Into<String>, reason: impl Into<String>) -> Self {
146        Self {
147            name: name.into(),
148            status: HealthStatus::Unhealthy(reason.into()),
149            details: None,
150        }
151    }
152
153    /// Attach arbitrary JSON details to this component.
154    ///
155    /// # Example
156    ///
157    /// ```
158    /// use stygian_graph::application::health::ComponentHealth;
159    ///
160    /// let c = ComponentHealth::healthy("http_pool")
161    ///     .with_details(serde_json::json!({ "idle_connections": 8, "max": 32 }));
162    ///
163    /// assert!(c.details.is_some());
164    /// ```
165    #[must_use]
166    pub fn with_details(mut self, details: serde_json::Value) -> Self {
167        self.details = Some(details);
168        self
169    }
170}
171
172// ─── HealthReport ────────────────────────────────────────────────────────────
173
174/// Aggregated health report for all registered components.
175///
176/// Returned by [`HealthReporter::report`].
177#[derive(Debug, Clone, Serialize, Deserialize)]
178pub struct HealthReport {
179    /// Overall system status (worst of all components)
180    pub overall: HealthStatus,
181    /// Per-component breakdown
182    pub components: Vec<ComponentHealth>,
183    /// When this report was generated (Unix seconds)
184    #[serde(with = "system_time_serde")]
185    pub checked_at: SystemTime,
186}
187
188impl HealthReport {
189    /// Returns `true` when the system is ready to serve traffic.
190    ///
191    /// The system is ready only when **all** components are healthy or degraded
192    /// (Kubernetes readiness probe).
193    ///
194    /// # Example
195    ///
196    /// ```
197    /// use stygian_graph::application::health::{HealthReporter, HealthStatus};
198    ///
199    /// let mut r = HealthReporter::new();
200    /// r.register("db", HealthStatus::Healthy);
201    /// assert!(r.report().is_ready());
202    /// ```
203    #[must_use]
204    pub fn is_ready(&self) -> bool {
205        self.components.iter().all(|c| c.status.is_operational())
206    }
207
208    /// Returns `true` while the process should continue running.
209    ///
210    /// The process is considered alive unless every component is unhealthy
211    /// (Kubernetes liveness probe).
212    ///
213    /// # Example
214    ///
215    /// ```
216    /// use stygian_graph::application::health::{HealthReporter, HealthStatus};
217    ///
218    /// let r = HealthReporter::new();
219    /// r.register("db", HealthStatus::Unhealthy("disk full".into()));
220    /// r.register("cache", HealthStatus::Healthy);
221    /// // One unhealthy component doesn't kill the process while others are healthy
222    /// assert!(r.report().is_live());
223    /// ```
224    #[must_use]
225    pub fn is_live(&self) -> bool {
226        // Dead when ALL components are unhealthy (or no components registered)
227        if self.components.is_empty() {
228            return true;
229        }
230        self.components.iter().any(|c| c.status.is_operational())
231    }
232
233    /// HTTP status code suitable for a health-check endpoint.
234    ///
235    /// Returns `200` when ready, `503` when not.
236    ///
237    /// # Example
238    ///
239    /// ```
240    /// use stygian_graph::application::health::{HealthReporter, HealthStatus};
241    ///
242    /// let mut r = HealthReporter::new();
243    /// r.register("db", HealthStatus::Healthy);
244    /// assert_eq!(r.report().http_status_code(), 200u16);
245    /// ```
246    #[must_use]
247    pub fn http_status_code(&self) -> u16 {
248        if self.is_ready() { 200 } else { 503 }
249    }
250}
251
252// ─── System-time serde helper ─────────────────────────────────────────────────
253
254mod system_time_serde {
255    use serde::{Deserialize, Deserializer, Serializer};
256    use std::time::{SystemTime, UNIX_EPOCH};
257
258    pub fn serialize<S: Serializer>(t: &SystemTime, s: S) -> Result<S::Ok, S::Error> {
259        let secs = t.duration_since(UNIX_EPOCH).unwrap_or_default().as_secs();
260        s.serialize_u64(secs)
261    }
262
263    pub fn deserialize<'de, D: Deserializer<'de>>(d: D) -> Result<SystemTime, D::Error> {
264        let secs = u64::deserialize(d)?;
265        Ok(UNIX_EPOCH + std::time::Duration::from_secs(secs))
266    }
267}
268
269// ─── HealthReporter ──────────────────────────────────────────────────────────
270
271/// Collects component-level health checks and produces a [`HealthReport`].
272///
273/// Thread-safe; cheaply cloneable via `Arc` patterns.
274///
275/// # Example
276///
277/// ```
278/// use stygian_graph::application::health::{HealthReporter, HealthStatus, ComponentHealth};
279///
280/// let mut reporter = HealthReporter::new();
281/// reporter.register("database", HealthStatus::Healthy);
282/// reporter.register_component(
283///     ComponentHealth::degraded("cache", "latency p99 > 100ms")
284///         .with_details(serde_json::json!({ "p99_ms": 142 }))
285/// );
286///
287/// let report = reporter.report();
288/// assert_eq!(report.http_status_code(), 200u16); // degraded is operational
289/// ```
290pub struct HealthReporter {
291    components: RwLock<HashMap<String, ComponentHealth>>,
292}
293
294impl HealthReporter {
295    /// Create an empty reporter.
296    ///
297    /// # Example
298    ///
299    /// ```
300    /// use stygian_graph::application::health::HealthReporter;
301    ///
302    /// let r = HealthReporter::new();
303    /// assert!(r.report().components.is_empty());
304    /// ```
305    #[must_use]
306    pub fn new() -> Self {
307        Self {
308            components: RwLock::new(HashMap::new()),
309        }
310    }
311
312    /// Register or update a component's status by name.
313    ///
314    /// # Example
315    ///
316    /// ```
317    /// use stygian_graph::application::health::{HealthReporter, HealthStatus};
318    ///
319    /// let mut r = HealthReporter::new();
320    /// r.register("db", HealthStatus::Healthy);
321    /// assert_eq!(r.report().components.len(), 1);
322    /// ```
323    pub fn register(&self, name: impl Into<String>, status: HealthStatus) {
324        let name = name.into();
325        let component = ComponentHealth {
326            name: name.clone(),
327            status,
328            details: None,
329        };
330        self.components.write().insert(name, component);
331    }
332
333    /// Register or update a component with full [`ComponentHealth`].
334    ///
335    /// # Example
336    ///
337    /// ```
338    /// use stygian_graph::application::health::{HealthReporter, ComponentHealth};
339    ///
340    /// let mut r = HealthReporter::new();
341    /// r.register_component(ComponentHealth::healthy("cache"));
342    /// assert_eq!(r.report().components.len(), 1);
343    /// ```
344    pub fn register_component(&self, component: ComponentHealth) {
345        self.components
346            .write()
347            .insert(component.name.clone(), component);
348    }
349
350    /// Remove a component from reporting.
351    ///
352    /// # Example
353    ///
354    /// ```
355    /// use stygian_graph::application::health::{HealthReporter, HealthStatus};
356    ///
357    /// let mut r = HealthReporter::new();
358    /// r.register("db", HealthStatus::Healthy);
359    /// r.deregister("db");
360    /// assert!(r.report().components.is_empty());
361    /// ```
362    pub fn deregister(&self, name: &str) {
363        self.components.write().remove(name);
364    }
365
366    /// Generate a [`HealthReport`] from current component states.
367    ///
368    /// # Example
369    ///
370    /// ```
371    /// use stygian_graph::application::health::{HealthReporter, HealthStatus};
372    ///
373    /// let r = HealthReporter::new();
374    /// let report = r.report();
375    /// assert_eq!(report.overall, HealthStatus::Healthy);
376    /// assert!(report.is_live());
377    /// ```
378    pub fn report(&self) -> HealthReport {
379        let components: Vec<ComponentHealth> = self.components.read().values().cloned().collect();
380
381        let overall = aggregate_status(&components);
382        HealthReport {
383            overall,
384            components,
385            checked_at: SystemTime::now(),
386        }
387    }
388}
389
390impl Default for HealthReporter {
391    fn default() -> Self {
392        Self::new()
393    }
394}
395
396fn aggregate_status(components: &[ComponentHealth]) -> HealthStatus {
397    let mut worst = HealthStatus::Healthy;
398    for c in components {
399        match &c.status {
400            HealthStatus::Unhealthy(r) => {
401                return HealthStatus::Unhealthy(r.clone());
402            }
403            HealthStatus::Degraded(r) => {
404                if worst == HealthStatus::Healthy {
405                    worst = HealthStatus::Degraded(r.clone());
406                }
407            }
408            HealthStatus::Healthy => {}
409        }
410    }
411    worst
412}
413
414// ─── Tests ────────────────────────────────────────────────────────────────────
415
#[cfg(test)]
// Panicking via `unwrap`/`expect` is the intended failure mode inside tests.
#[allow(clippy::unwrap_used, clippy::expect_used)]
mod tests {
    use super::*;

    #[test]
    fn healthy_status_is_healthy() {
        assert!(HealthStatus::Healthy.is_healthy());
        assert!(HealthStatus::Healthy.is_operational());
    }

    #[test]
    fn degraded_status_is_not_healthy_but_operational() {
        let s = HealthStatus::Degraded("reason".into());
        assert!(!s.is_healthy());
        assert!(s.is_operational());
    }

    #[test]
    fn unhealthy_status_is_not_operational() {
        let s = HealthStatus::Unhealthy("crashed".into());
        assert!(!s.is_healthy());
        assert!(!s.is_operational());
    }

    #[test]
    fn empty_reporter_overall_is_healthy() {
        let reporter = HealthReporter::new();
        assert_eq!(reporter.report().overall, HealthStatus::Healthy);
    }

    #[test]
    fn empty_report_is_ready_and_live() {
        // With nothing registered there is no evidence of failure.
        let report = HealthReporter::new().report();
        assert!(report.is_ready());
        assert!(report.is_live());
        assert_eq!(report.http_status_code(), 200);
    }

    #[test]
    fn all_healthy_report_is_ready_and_live() {
        let r = HealthReporter::new();
        r.register("db", HealthStatus::Healthy);
        r.register("cache", HealthStatus::Healthy);
        let report = r.report();
        assert!(report.is_ready());
        assert!(report.is_live());
        assert_eq!(report.http_status_code(), 200);
    }

    #[test]
    fn degraded_component_report_is_ready_and_live() {
        let r = HealthReporter::new();
        r.register("db", HealthStatus::Healthy);
        r.register("cache", HealthStatus::Degraded("high latency".into()));
        let report = r.report();
        // Degraded is operational so is_ready returns true
        assert!(report.is_ready());
        assert!(report.is_live());
    }

    #[test]
    fn unhealthy_component_makes_report_not_ready() {
        let r = HealthReporter::new();
        r.register("db", HealthStatus::Unhealthy("connection refused".into()));
        let report = r.report();
        assert!(!report.is_ready());
        assert_eq!(report.http_status_code(), 503);
    }

    #[test]
    fn all_unhealthy_not_live() {
        let r = HealthReporter::new();
        r.register("a", HealthStatus::Unhealthy("x".into()));
        r.register("b", HealthStatus::Unhealthy("y".into()));
        assert!(!r.report().is_live());
    }

    #[test]
    fn register_same_component_updates_status() {
        let r = HealthReporter::new();
        r.register("db", HealthStatus::Healthy);
        r.register("db", HealthStatus::Unhealthy("disk full".into()));
        let report = r.report();
        assert_eq!(report.components.len(), 1);
        assert!(!report.is_ready());
    }

    #[test]
    fn deregister_removes_component() {
        let r = HealthReporter::new();
        r.register("db", HealthStatus::Healthy);
        r.deregister("db");
        assert!(r.report().components.is_empty());
    }

    #[test]
    fn component_health_builders() {
        assert!(ComponentHealth::healthy("x").status.is_healthy());
        assert!(
            ComponentHealth::degraded("x", "slow")
                .status
                .is_operational()
        );
        assert!(
            !ComponentHealth::unhealthy("x", "down")
                .status
                .is_operational()
        );
    }

    #[test]
    fn component_with_details_serializes() {
        let c = ComponentHealth::healthy("pool").with_details(serde_json::json!({ "idle": 8 }));
        assert!(c.details.is_some());
        let json = serde_json::to_string(&c).unwrap();
        assert!(json.contains("idle"));
    }

    #[test]
    fn health_report_serializes_to_json() {
        let r = HealthReporter::new();
        r.register("db", HealthStatus::Healthy);
        let report = r.report();
        let json = serde_json::to_string(&report).expect("serialize");
        assert!(json.contains("healthy"));
    }

    #[test]
    fn aggregate_status_worst_wins() {
        let components = vec![
            ComponentHealth::healthy("a"),
            ComponentHealth::degraded("b", "slow"),
            ComponentHealth::unhealthy("c", "down"),
        ];
        let status = aggregate_status(&components);
        assert!(matches!(status, HealthStatus::Unhealthy(_)));
    }
}