stygian_graph/application/health.rs
1//! Health check reporting for Kubernetes liveness and readiness probes.
2//!
3//! Provides structured health-check types and a [`HealthReporter`](health::HealthReporter) for aggregating
4//! component-level health into an overall [`HealthReport`](health::HealthReport).
5//!
6//! # Example
7//!
8//! ```
9//! use stygian_graph::application::health::{HealthReporter, HealthStatus, ComponentHealth};
10//!
//! let reporter = HealthReporter::new();
12//! reporter.register("database", HealthStatus::Healthy);
13//! reporter.register("cache", HealthStatus::Degraded("High latency".to_string()));
14//!
15//! let report = reporter.report();
16//! assert!(report.is_ready()); // Degraded is still operational ⇒ ready
17//! assert!(report.is_live()); // Still alive while degraded
18//! ```
19
20use std::collections::HashMap;
21use std::time::SystemTime;
22
23use parking_lot::RwLock;
24
25use serde::{Deserialize, Serialize};
26
27// ─── HealthStatus ─────────────────────────────────────────────────────────────
28
29/// The health status of a single component.
30#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
31#[serde(tag = "status", content = "reason", rename_all = "lowercase")]
32pub enum HealthStatus {
33 /// Component is operating normally.
34 Healthy,
35 /// Component is partially impaired but still serving requests.
36 Degraded(String),
37 /// Component is non-functional; requests will fail.
38 Unhealthy(String),
39}
40
41impl HealthStatus {
42 /// Returns `true` only when the component is fully healthy.
43 ///
44 /// # Example
45 ///
46 /// ```
47 /// use stygian_graph::application::health::HealthStatus;
48 /// assert!(HealthStatus::Healthy.is_healthy());
49 /// assert!(!HealthStatus::Degraded("latency".into()).is_healthy());
50 /// ```
51 pub const fn is_healthy(&self) -> bool {
52 matches!(self, Self::Healthy)
53 }
54
55 /// Returns `true` when the component can still serve requests (healthy or degraded).
56 ///
57 /// # Example
58 ///
59 /// ```
60 /// use stygian_graph::application::health::HealthStatus;
61 /// assert!(HealthStatus::Healthy.is_operational());
62 /// assert!(HealthStatus::Degraded("high latency".into()).is_operational());
63 /// assert!(!HealthStatus::Unhealthy("connection refused".into()).is_operational());
64 /// ```
65 pub const fn is_operational(&self) -> bool {
66 !matches!(self, Self::Unhealthy(_))
67 }
68}
69
70impl std::fmt::Display for HealthStatus {
71 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
72 match self {
73 Self::Healthy => write!(f, "healthy"),
74 Self::Degraded(r) => write!(f, "degraded: {r}"),
75 Self::Unhealthy(r) => write!(f, "unhealthy: {r}"),
76 }
77 }
78}
79
80// ─── ComponentHealth ─────────────────────────────────────────────────────────
81
82/// Health state for a single named component.
83///
84/// Returned as part of a [`HealthReport`].
85#[derive(Debug, Clone, Serialize, Deserialize)]
86pub struct ComponentHealth {
87 /// Component identifier (e.g. `"database"`, `"cache"`, `"worker_pool"`)
88 pub name: String,
89 /// Component status
90 pub status: HealthStatus,
91 /// Optional free-form details (timings, error messages, etc.)
92 #[serde(skip_serializing_if = "Option::is_none")]
93 pub details: Option<serde_json::Value>,
94}
95
96impl ComponentHealth {
97 /// Create a healthy component with no extra details.
98 ///
99 /// # Example
100 ///
101 /// ```
102 /// use stygian_graph::application::health::{ComponentHealth, HealthStatus};
103 ///
104 /// let c = ComponentHealth::healthy("cache");
105 /// assert_eq!(c.status, HealthStatus::Healthy);
106 /// ```
107 pub fn healthy(name: impl Into<String>) -> Self {
108 Self {
109 name: name.into(),
110 status: HealthStatus::Healthy,
111 details: None,
112 }
113 }
114
115 /// Create a degraded component.
116 ///
117 /// # Example
118 ///
119 /// ```
120 /// use stygian_graph::application::health::ComponentHealth;
121 ///
122 /// let c = ComponentHealth::degraded("database", "replication lag 5s");
123 /// assert!(!c.status.is_healthy());
124 /// ```
125 pub fn degraded(name: impl Into<String>, reason: impl Into<String>) -> Self {
126 Self {
127 name: name.into(),
128 status: HealthStatus::Degraded(reason.into()),
129 details: None,
130 }
131 }
132
133 /// Create an unhealthy component.
134 ///
135 /// # Example
136 ///
137 /// ```
138 /// use stygian_graph::application::health::ComponentHealth;
139 ///
140 /// let c = ComponentHealth::unhealthy("valkey", "connection refused");
141 /// assert!(!c.status.is_operational());
142 /// ```
143 pub fn unhealthy(name: impl Into<String>, reason: impl Into<String>) -> Self {
144 Self {
145 name: name.into(),
146 status: HealthStatus::Unhealthy(reason.into()),
147 details: None,
148 }
149 }
150
151 /// Attach arbitrary JSON details to this component.
152 ///
153 /// # Example
154 ///
155 /// ```
156 /// use stygian_graph::application::health::ComponentHealth;
157 ///
158 /// let c = ComponentHealth::healthy("http_pool")
159 /// .with_details(serde_json::json!({ "idle_connections": 8, "max": 32 }));
160 ///
161 /// assert!(c.details.is_some());
162 /// ```
163 #[must_use]
164 pub fn with_details(mut self, details: serde_json::Value) -> Self {
165 self.details = Some(details);
166 self
167 }
168}
169
170// ─── HealthReport ────────────────────────────────────────────────────────────
171
172/// Aggregated health report for all registered components.
173///
174/// Returned by [`HealthReporter::report`].
175#[derive(Debug, Clone, Serialize, Deserialize)]
176pub struct HealthReport {
177 /// Overall system status (worst of all components)
178 pub overall: HealthStatus,
179 /// Per-component breakdown
180 pub components: Vec<ComponentHealth>,
181 /// When this report was generated (Unix seconds)
182 #[serde(with = "system_time_serde")]
183 pub checked_at: SystemTime,
184}
185
186impl HealthReport {
187 /// Returns `true` when the system is ready to serve traffic.
188 ///
189 /// The system is ready only when **all** components are healthy or degraded
190 /// (Kubernetes readiness probe).
191 ///
192 /// # Example
193 ///
194 /// ```
195 /// use stygian_graph::application::health::{HealthReporter, HealthStatus};
196 ///
197 /// let mut r = HealthReporter::new();
198 /// r.register("db", HealthStatus::Healthy);
199 /// assert!(r.report().is_ready());
200 /// ```
201 pub fn is_ready(&self) -> bool {
202 self.components.iter().all(|c| c.status.is_operational())
203 }
204
205 /// Returns `true` while the process should continue running.
206 ///
207 /// The process is considered alive unless every component is unhealthy
208 /// (Kubernetes liveness probe).
209 ///
210 /// # Example
211 ///
212 /// ```
213 /// use stygian_graph::application::health::{HealthReporter, HealthStatus};
214 ///
215 /// let r = HealthReporter::new();
216 /// r.register("db", HealthStatus::Unhealthy("disk full".into()));
217 /// r.register("cache", HealthStatus::Healthy);
218 /// // One unhealthy component doesn't kill the process while others are healthy
219 /// assert!(r.report().is_live());
220 /// ```
221 pub fn is_live(&self) -> bool {
222 // Dead when ALL components are unhealthy (or no components registered)
223 if self.components.is_empty() {
224 return true;
225 }
226 self.components.iter().any(|c| c.status.is_operational())
227 }
228
229 /// HTTP status code suitable for a health-check endpoint.
230 ///
231 /// Returns `200` when ready, `503` when not.
232 ///
233 /// # Example
234 ///
235 /// ```
236 /// use stygian_graph::application::health::{HealthReporter, HealthStatus};
237 ///
238 /// let mut r = HealthReporter::new();
239 /// r.register("db", HealthStatus::Healthy);
240 /// assert_eq!(r.report().http_status_code(), 200u16);
241 /// ```
242 pub fn http_status_code(&self) -> u16 {
243 if self.is_ready() { 200 } else { 503 }
244 }
245}
246
247// ─── System-time serde helper ─────────────────────────────────────────────────
248
249mod system_time_serde {
250 use serde::{Deserialize, Deserializer, Serializer};
251 use std::time::{SystemTime, UNIX_EPOCH};
252
253 pub fn serialize<S: Serializer>(t: &SystemTime, s: S) -> Result<S::Ok, S::Error> {
254 let secs = t.duration_since(UNIX_EPOCH).unwrap_or_default().as_secs();
255 s.serialize_u64(secs)
256 }
257
258 pub fn deserialize<'de, D: Deserializer<'de>>(d: D) -> Result<SystemTime, D::Error> {
259 let secs = u64::deserialize(d)?;
260 Ok(UNIX_EPOCH + std::time::Duration::from_secs(secs))
261 }
262}
263
264// ─── HealthReporter ──────────────────────────────────────────────────────────
265
266/// Collects component-level health checks and produces a [`HealthReport`].
267///
268/// Thread-safe; cheaply cloneable via `Arc` patterns.
269///
270/// # Example
271///
272/// ```
273/// use stygian_graph::application::health::{HealthReporter, HealthStatus, ComponentHealth};
274///
275/// let mut reporter = HealthReporter::new();
276/// reporter.register("database", HealthStatus::Healthy);
277/// reporter.register_component(
278/// ComponentHealth::degraded("cache", "latency p99 > 100ms")
279/// .with_details(serde_json::json!({ "p99_ms": 142 }))
280/// );
281///
282/// let report = reporter.report();
283/// assert_eq!(report.http_status_code(), 200u16); // degraded is operational
284/// ```
285pub struct HealthReporter {
286 components: RwLock<HashMap<String, ComponentHealth>>,
287}
288
289impl HealthReporter {
290 /// Create an empty reporter.
291 ///
292 /// # Example
293 ///
294 /// ```
295 /// use stygian_graph::application::health::HealthReporter;
296 ///
297 /// let r = HealthReporter::new();
298 /// assert!(r.report().components.is_empty());
299 /// ```
300 pub fn new() -> Self {
301 Self {
302 components: RwLock::new(HashMap::new()),
303 }
304 }
305
306 /// Register or update a component's status by name.
307 ///
308 /// # Example
309 ///
310 /// ```
311 /// use stygian_graph::application::health::{HealthReporter, HealthStatus};
312 ///
313 /// let mut r = HealthReporter::new();
314 /// r.register("db", HealthStatus::Healthy);
315 /// assert_eq!(r.report().components.len(), 1);
316 /// ```
317 pub fn register(&self, name: impl Into<String>, status: HealthStatus) {
318 let name = name.into();
319 let component = ComponentHealth {
320 name: name.clone(),
321 status,
322 details: None,
323 };
324 self.components.write().insert(name, component);
325 }
326
327 /// Register or update a component with full [`ComponentHealth`].
328 ///
329 /// # Example
330 ///
331 /// ```
332 /// use stygian_graph::application::health::{HealthReporter, ComponentHealth};
333 ///
334 /// let mut r = HealthReporter::new();
335 /// r.register_component(ComponentHealth::healthy("cache"));
336 /// assert_eq!(r.report().components.len(), 1);
337 /// ```
338 pub fn register_component(&self, component: ComponentHealth) {
339 self.components
340 .write()
341 .insert(component.name.clone(), component);
342 }
343
344 /// Remove a component from reporting.
345 ///
346 /// # Example
347 ///
348 /// ```
349 /// use stygian_graph::application::health::{HealthReporter, HealthStatus};
350 ///
351 /// let mut r = HealthReporter::new();
352 /// r.register("db", HealthStatus::Healthy);
353 /// r.deregister("db");
354 /// assert!(r.report().components.is_empty());
355 /// ```
356 pub fn deregister(&self, name: &str) {
357 self.components.write().remove(name);
358 }
359
360 /// Generate a [`HealthReport`] from current component states.
361 ///
362 /// # Example
363 ///
364 /// ```
365 /// use stygian_graph::application::health::{HealthReporter, HealthStatus};
366 ///
367 /// let r = HealthReporter::new();
368 /// let report = r.report();
369 /// assert_eq!(report.overall, HealthStatus::Healthy);
370 /// assert!(report.is_live());
371 /// ```
372 pub fn report(&self) -> HealthReport {
373 let components: Vec<ComponentHealth> = self.components.read().values().cloned().collect();
374
375 let overall = aggregate_status(&components);
376 HealthReport {
377 overall,
378 components,
379 checked_at: SystemTime::now(),
380 }
381 }
382}
383
384impl Default for HealthReporter {
385 fn default() -> Self {
386 Self::new()
387 }
388}
389
390fn aggregate_status(components: &[ComponentHealth]) -> HealthStatus {
391 let mut worst = HealthStatus::Healthy;
392 for c in components {
393 match &c.status {
394 HealthStatus::Unhealthy(r) => {
395 return HealthStatus::Unhealthy(r.clone());
396 }
397 HealthStatus::Degraded(r) => {
398 if worst == HealthStatus::Healthy {
399 worst = HealthStatus::Degraded(r.clone());
400 }
401 }
402 HealthStatus::Healthy => {}
403 }
404 }
405 worst
406}
407
408// ─── Tests ────────────────────────────────────────────────────────────────────
409
#[cfg(test)]
// Panicking on an unexpected `None`/`Err` is the desired failure mode in tests.
#[allow(clippy::unwrap_used, clippy::expect_used)]
mod tests {
    //! Unit tests for status classification, report aggregation and the
    //! reporter's register/deregister bookkeeping.

    use super::*;

    #[test]
    fn healthy_status_is_healthy() {
        assert!(HealthStatus::Healthy.is_healthy());
        assert!(HealthStatus::Healthy.is_operational());
    }

    #[test]
    fn degraded_status_is_not_healthy_but_operational() {
        let s = HealthStatus::Degraded("reason".into());
        assert!(!s.is_healthy());
        assert!(s.is_operational());
    }

    #[test]
    fn unhealthy_status_is_not_operational() {
        let s = HealthStatus::Unhealthy("crashed".into());
        assert!(!s.is_healthy());
        assert!(!s.is_operational());
    }

    #[test]
    fn status_display_includes_reason() {
        assert_eq!(HealthStatus::Healthy.to_string(), "healthy");
        assert_eq!(
            HealthStatus::Degraded("slow".into()).to_string(),
            "degraded: slow"
        );
        assert_eq!(
            HealthStatus::Unhealthy("down".into()).to_string(),
            "unhealthy: down"
        );
    }

    #[test]
    fn empty_reporter_overall_is_healthy() {
        let reporter = HealthReporter::new();
        assert_eq!(reporter.report().overall, HealthStatus::Healthy);
    }

    #[test]
    fn empty_reporter_is_ready_and_live() {
        // With nothing registered there is no evidence of failure.
        let report = HealthReporter::new().report();
        assert!(report.is_ready());
        assert!(report.is_live());
        assert_eq!(report.http_status_code(), 200);
    }

    #[test]
    fn all_healthy_report_is_ready_and_live() {
        let r = HealthReporter::new();
        r.register("db", HealthStatus::Healthy);
        r.register("cache", HealthStatus::Healthy);
        let report = r.report();
        assert!(report.is_ready());
        assert!(report.is_live());
        assert_eq!(report.http_status_code(), 200);
    }

    #[test]
    fn degraded_component_report_is_ready_and_live() {
        let r = HealthReporter::new();
        r.register("db", HealthStatus::Healthy);
        r.register("cache", HealthStatus::Degraded("high latency".into()));
        let report = r.report();
        // Degraded is operational so is_ready returns true
        assert!(report.is_ready());
        assert!(report.is_live());
    }

    #[test]
    fn unhealthy_component_makes_report_not_ready() {
        let r = HealthReporter::new();
        r.register("db", HealthStatus::Unhealthy("connection refused".into()));
        let report = r.report();
        assert!(!report.is_ready());
        assert_eq!(report.http_status_code(), 503);
    }

    #[test]
    fn all_unhealthy_not_live() {
        let r = HealthReporter::new();
        r.register("a", HealthStatus::Unhealthy("x".into()));
        r.register("b", HealthStatus::Unhealthy("y".into()));
        assert!(!r.report().is_live());
    }

    #[test]
    fn register_same_component_updates_status() {
        let r = HealthReporter::new();
        r.register("db", HealthStatus::Healthy);
        r.register("db", HealthStatus::Unhealthy("disk full".into()));
        let report = r.report();
        assert_eq!(report.components.len(), 1);
        assert!(!report.is_ready());
    }

    #[test]
    fn deregister_removes_component() {
        let r = HealthReporter::new();
        r.register("db", HealthStatus::Healthy);
        r.deregister("db");
        assert!(r.report().components.is_empty());
    }

    #[test]
    fn component_health_builders() {
        assert!(ComponentHealth::healthy("x").status.is_healthy());
        assert!(
            ComponentHealth::degraded("x", "slow")
                .status
                .is_operational()
        );
        assert!(
            !ComponentHealth::unhealthy("x", "down")
                .status
                .is_operational()
        );
    }

    #[test]
    fn component_with_details_serializes() {
        let c = ComponentHealth::healthy("pool").with_details(serde_json::json!({ "idle": 8 }));
        assert!(c.details.is_some());
        let json = serde_json::to_string(&c).unwrap();
        assert!(json.contains("idle"));
    }

    #[test]
    fn health_report_serializes_to_json() {
        let r = HealthReporter::new();
        r.register("db", HealthStatus::Healthy);
        let report = r.report();
        let json = serde_json::to_string(&report).expect("serialize");
        assert!(json.contains("healthy"));
    }

    #[test]
    fn aggregate_status_worst_wins() {
        let components = vec![
            ComponentHealth::healthy("a"),
            ComponentHealth::degraded("b", "slow"),
            ComponentHealth::unhealthy("c", "down"),
        ];
        let status = aggregate_status(&components);
        assert!(matches!(status, HealthStatus::Unhealthy(_)));
    }

    #[test]
    fn aggregate_status_reports_first_degraded_reason() {
        let components = vec![
            ComponentHealth::healthy("a"),
            ComponentHealth::degraded("b", "slow"),
            ComponentHealth::degraded("c", "lag"),
        ];
        assert_eq!(
            aggregate_status(&components),
            HealthStatus::Degraded("slow".into())
        );
    }
}
540}