Skip to main content

stygian_graph/application/
schema_discovery.rs

1//! Intelligent schema discovery service
2//!
3//! Automatically infers JSON Schema definitions from:
4//! 1. **JSON/object examples** — type-inspects values to generate schema
5//! 2. **HTML content** — uses an AI provider to suggest extraction schemas
6//! 3. **User corrections** — applies overrides stored in a correction map
//! 4. **Evolution detection** — compares new inferences against cached schemas
//! 5. **GraphQL introspection** — converts a live endpoint's `__schema` response
8//!
9//! The service sits in the application layer and depends on `AIProvider` for
10//! LLM-assisted HTML schema inference.
11//!
12//! # Example
13//!
14//! ```no_run
15//! use stygian_graph::application::schema_discovery::{SchemaDiscoveryService, SchemaDiscoveryConfig};
16//! use serde_json::json;
17//!
18//! # tokio::runtime::Runtime::new().unwrap().block_on(async {
19//! let service = SchemaDiscoveryService::new(SchemaDiscoveryConfig::default(), None);
20//!
21//! // Infer from a JSON example
22//! let example = json!({"name": "Alice", "age": 30, "active": true});
23//! let schema = service.infer_from_example(&example);
24//! assert_eq!(schema["type"].as_str().unwrap(), "object");
25//! # });
26//! ```
27
28use std::collections::HashMap;
29use std::sync::Arc;
30
31use parking_lot::RwLock;
32use serde_json::{Map, Value, json};
33use tracing::debug;
34
/// Full standard GraphQL introspection query (6-level `ofType` depth for Relay
/// connection types such as `NON_NULL(LIST(NON_NULL(OBJECT)))`).
///
/// The query asks for the root operation type names, every type via the
/// `FullType` fragment (fields, input fields, interfaces, enum values and
/// possible types, deprecated members included) and all directives.
///
/// Note that `FullType` selects both `fields` and `inputFields` on every
/// type, so a response carries both keys for each type regardless of kind.
const INTROSPECTION_QUERY: &str = r"
query IntrospectionQuery {
  __schema {
    queryType { name }
    mutationType { name }
    subscriptionType { name }
    types { ...FullType }
    directives {
      name description locations
      args { ...InputValue }
    }
  }
}
fragment FullType on __Type {
  kind name description
  fields(includeDeprecated: true) {
    name description
    args { ...InputValue }
    type { ...TypeRef }
    isDeprecated deprecationReason
  }
  inputFields { ...InputValue }
  interfaces { ...TypeRef }
  enumValues(includeDeprecated: true) {
    name description isDeprecated deprecationReason
  }
  possibleTypes { ...TypeRef }
}
fragment InputValue on __InputValue {
  name description
  type { ...TypeRef }
  defaultValue
}
fragment TypeRef on __Type {
  kind name
  ofType {
    kind name
    ofType {
      kind name
      ofType {
        kind name
        ofType {
          kind name
          ofType {
            kind name
            ofType { kind name }
          }
        }
      }
    }
  }
}
";
90
91use crate::domain::error::{ProviderError, Result, ServiceError, StygianError};
92use crate::ports::{AIProvider, GraphQlAuth, ScrapingService, ServiceInput};
93
/// Discovered schema with metadata
///
/// This is the value stored in, and returned from, the service's schema
/// cache. Cache hits are returned as clones whose `source` is overwritten
/// with [`SchemaSource::Cached`].
#[derive(Debug, Clone)]
pub struct DiscoveredSchema {
    /// The JSON Schema object
    pub schema: Value,
    /// How the schema was derived
    pub source: SchemaSource,
    /// Confidence score [0.0, 1.0]
    pub confidence: f32,
    /// URL pattern or domain the schema is associated with.
    ///
    /// For GraphQL introspection results this holds the endpoint URL.
    pub url_pattern: Option<String>,
}
106
/// How a schema was derived
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum SchemaSource {
    /// Inferred by type-inspecting a JSON example
    ExampleInference,
    /// Generated by an AI provider analysing HTML structure.
    ///
    /// NOTE(review): `infer_from_graphql` also tags its introspection results
    /// with this variant — there is no dedicated introspection variant.
    AiInference,
    /// Loaded from cache (set on the copy handed back for a cache hit)
    Cached,
    /// Manually supplied / corrected by user
    UserCorrection,
}
119
/// Tunable settings for the schema discovery service.
#[derive(Debug, Clone)]
pub struct SchemaDiscoveryConfig {
    /// Upper bound on the amount of HTML forwarded to the AI provider;
    /// longer input is truncated before the prompt is built
    pub max_content_chars: usize,
    /// Whether discovered schemas are stored in, and served from, the cache
    pub cache_schemas: bool,
    /// Minimum confidence to accept an AI-inferred schema.
    ///
    /// NOTE(review): `infer_from_html` does not currently consult this value.
    pub min_ai_confidence: f32,
}

impl Default for SchemaDiscoveryConfig {
    /// Defaults: 16 000 units of HTML, caching enabled, 0.6 minimum confidence.
    fn default() -> Self {
        const DEFAULT_MAX_CONTENT_CHARS: usize = 16_000;
        const DEFAULT_MIN_AI_CONFIDENCE: f32 = 0.6;

        Self {
            max_content_chars: DEFAULT_MAX_CONTENT_CHARS,
            cache_schemas: true,
            min_ai_confidence: DEFAULT_MIN_AI_CONFIDENCE,
        }
    }
}
140
/// Schema evolution diff — describes changes between two schema versions
///
/// Only the top-level `properties` of each schema are compared.
#[derive(Debug, Clone)]
pub struct SchemaDiff {
    /// Fields present in new schema but not in old
    pub added_fields: Vec<String>,
    /// Fields present in old schema but not in new
    pub removed_fields: Vec<String>,
    /// Fields whose type changed, as `(field, old_type, new_type)` tuples.
    /// A property without a string `type` is reported as `"any"`.
    pub type_changes: Vec<(String, String, String)>, // (field, old_type, new_type)
    /// Whether schemas are considered compatible: `true` only when no field
    /// was removed and no field changed type (additions are allowed)
    pub is_compatible: bool,
}
153
/// Intelligent schema discovery service
///
/// Infers JSON Schemas from content using local heuristics and an optional
/// AI provider. Maintains a schema cache for URL patterns and supports
/// user corrections.
///
/// The cache and the correction map are each behind an `Arc<RwLock<…>>`, so
/// the mutating methods take `&self`.
pub struct SchemaDiscoveryService {
    /// Behavioural settings (truncation limit, caching toggle, …)
    config: SchemaDiscoveryConfig,
    /// Optional AI provider for HTML-to-schema inference
    ai_provider: Option<Arc<dyn AIProvider>>,
    /// Schema cache: `url_pattern` → [`DiscoveredSchema`]
    ///
    /// GraphQL introspection results share this map, keyed by endpoint URL.
    cache: Arc<RwLock<HashMap<String, DiscoveredSchema>>>,
    /// User correction map: `field_name` → corrected schema fragment
    corrections: Arc<RwLock<HashMap<String, Value>>>,
    /// Optional GraphQL scraping service for introspection
    graphql_service: Option<Arc<dyn ScrapingService>>,
}
170
171impl SchemaDiscoveryService {
172    /// Create a new schema discovery service
173    ///
174    /// # Example
175    ///
176    /// ```
177    /// use stygian_graph::application::schema_discovery::{SchemaDiscoveryService, SchemaDiscoveryConfig};
178    ///
179    /// let service = SchemaDiscoveryService::new(SchemaDiscoveryConfig::default(), None);
180    /// ```
181    #[must_use]
182    pub fn new(config: SchemaDiscoveryConfig, ai_provider: Option<Arc<dyn AIProvider>>) -> Self {
183        Self {
184            config,
185            ai_provider,
186            cache: Arc::new(RwLock::new(HashMap::new())),
187            corrections: Arc::new(RwLock::new(HashMap::new())),
188            graphql_service: None,
189        }
190    }
191
192    /// Attach a GraphQL scraping service used by [`infer_from_graphql`](Self::infer_from_graphql).
193    ///
194    /// # Example
195    ///
196    /// ```no_run
197    /// use stygian_graph::application::schema_discovery::{SchemaDiscoveryService, SchemaDiscoveryConfig};
198    /// use stygian_graph::adapters::graphql::GraphQlService;
199    /// use std::sync::Arc;
200    ///
201    /// let gql = Arc::new(GraphQlService::new(Default::default(), None));
202    /// let svc = SchemaDiscoveryService::new(SchemaDiscoveryConfig::default(), None)
203    ///     .with_graphql_service(gql);
204    /// ```
205    #[must_use]
206    pub fn with_graphql_service(mut self, service: Arc<dyn ScrapingService>) -> Self {
207        self.graphql_service = Some(service);
208        self
209    }
210
    /// Infer a JSON Schema by type-inspecting a JSON example value.
    ///
    /// Supports objects, arrays, strings, numbers, booleans, and null.
    ///
    /// Inference is purely structural and delegates to the recursive
    /// `infer_schema_for_value` helper:
    ///
    /// * every key of an example object is listed under `required`;
    /// * an array's `items` schema is taken from its first element only
    ///   (an empty array yields an unconstrained `{}` item schema);
    /// * integral numbers become `"integer"`, all other numbers `"number"`.
    ///
    /// Neither the cache nor user corrections are consulted.
    ///
    /// # Example
    ///
    /// ```
    /// use stygian_graph::application::schema_discovery::{SchemaDiscoveryService, SchemaDiscoveryConfig};
    /// use serde_json::json;
    ///
    /// let svc = SchemaDiscoveryService::new(SchemaDiscoveryConfig::default(), None);
    /// let example = json!({"name": "Alice", "score": 9.5, "active": true});
    /// let schema = svc.infer_from_example(&example);
    ///
    /// assert_eq!(schema["type"].as_str().unwrap(), "object");
    /// assert_eq!(schema["properties"]["name"]["type"].as_str().unwrap(), "string");
    /// assert_eq!(schema["properties"]["score"]["type"].as_str().unwrap(), "number");
    /// ```
    #[must_use]
    pub fn infer_from_example(&self, example: &Value) -> Value {
        Self::infer_schema_for_value(example)
    }
233
234    /// Recursively infer a JSON Schema for a single JSON value
235    fn infer_schema_for_value(value: &Value) -> Value {
236        match value {
237            Value::Object(map) => {
238                let mut properties = Map::new();
239                let mut required = Vec::new();
240                for (key, val) in map {
241                    properties.insert(key.clone(), Self::infer_schema_for_value(val));
242                    required.push(json!(key));
243                }
244                json!({
245                    "type": "object",
246                    "properties": Value::Object(properties),
247                    "required": required
248                })
249            }
250            Value::Array(items) => {
251                // Infer item schema from first element
252                let item_schema = items
253                    .first()
254                    .map_or_else(|| json!({}), Self::infer_schema_for_value);
255                json!({
256                    "type": "array",
257                    "items": item_schema
258                })
259            }
260            Value::String(_) => json!({"type": "string"}),
261            Value::Number(n) => {
262                if n.is_i64() || n.is_u64() {
263                    json!({"type": "integer"})
264                } else {
265                    json!({"type": "number"})
266                }
267            }
268            Value::Bool(_) => json!({"type": "boolean"}),
269            Value::Null => json!({"type": "null"}),
270        }
271    }
272
273    /// Infer a schema from HTML content using an AI provider.
274    ///
275    /// Returns an error if no AI provider is configured.
276    ///
277    /// # Example
278    ///
279    /// ```no_run
280    /// use stygian_graph::application::schema_discovery::{SchemaDiscoveryService, SchemaDiscoveryConfig};
281    ///
282    /// # tokio::runtime::Runtime::new().unwrap().block_on(async {
283    /// let service = SchemaDiscoveryService::new(SchemaDiscoveryConfig::default(), None);
284    /// // Without an AI provider, this returns Err
285    /// let result = service.infer_from_html("<html>...</html>", None).await;
286    /// assert!(result.is_err());
287    /// # });
288    /// ```
289    ///
290    /// # Panics
291    ///
292    /// Panics if the internal `RwLock` is poisoned (i.e. another thread panicked
293    /// while holding the lock). Treat this as unrecoverable.
294    ///
295    /// # Errors
296    ///
297    /// Returns [`StygianError`] when no `AIProvider` is configured, the AI
298    /// provider fails, the supplied HTML cannot be parsed, or the cache lookup
299    /// fails.
300    pub async fn infer_from_html(
301        &self,
302        html: &str,
303        url_pattern: Option<&str>,
304    ) -> Result<DiscoveredSchema> {
305        // Check cache first
306        if self.config.cache_schemas
307            && let Some(pattern) = url_pattern
308        {
309            let cache = self.cache.read();
310            if let Some(cached) = cache.get(pattern) {
311                debug!(pattern, "Schema cache hit");
312                return Ok(DiscoveredSchema {
313                    source: SchemaSource::Cached,
314                    ..cached.clone()
315                });
316            }
317        }
318
319        let provider = self.ai_provider.as_ref().ok_or_else(|| {
320            StygianError::Provider(ProviderError::ApiError(
321                "No AI provider configured for HTML schema discovery".to_string(),
322            ))
323        })?;
324
325        let truncated = if html.len() > self.config.max_content_chars {
326            &html[..self.config.max_content_chars]
327        } else {
328            html
329        };
330
331        let discovery_schema = json!({
332            "type": "object",
333            "properties": {
334                "schema": {
335                    "type": "object",
336                    "description": "A valid JSON Schema object describing the structured data in the HTML"
337                },
338                "confidence": {
339                    "type": "number",
340                    "minimum": 0.0,
341                    "maximum": 1.0,
342                    "description": "Confidence in the inferred schema [0.0, 1.0]"
343                },
344                "description": {
345                    "type": "string",
346                    "description": "Human-readable explanation of what data was detected"
347                }
348            },
349            "required": ["schema", "confidence"]
350        });
351
352        let prompt = format!(
353            "Analyze the following HTML page and infer a JSON Schema for the main structured \
354             data it contains (e.g. product listings, articles, search results). \
355             Return the schema definition.\n\nHTML:\n{truncated}"
356        );
357
358        let result = provider.extract(prompt, discovery_schema).await?;
359
360        let schema = result
361            .get("schema")
362            .cloned()
363            .unwrap_or_else(|| json!({"type": "object"}));
364
365        #[allow(clippy::cast_possible_truncation)]
366        let confidence = result
367            .get("confidence")
368            .and_then(Value::as_f64)
369            .unwrap_or(0.7) as f32;
370
371        // Apply any user corrections
372        let schema = self.apply_corrections(schema);
373
374        let discovered = DiscoveredSchema {
375            schema,
376            source: SchemaSource::AiInference,
377            confidence,
378            url_pattern: url_pattern.map(str::to_string),
379        };
380
381        // Cache the result
382        if self.config.cache_schemas
383            && let Some(pattern) = url_pattern
384        {
385            self.cache
386                .write()
387                .insert(pattern.to_string(), discovered.clone());
388        }
389
390        Ok(discovered)
391    }
392
393    /// Apply user corrections to an inferred schema
394    fn apply_corrections(&self, mut schema: Value) -> Value {
395        let corrections = self.corrections.read();
396        if corrections.is_empty() {
397            return schema;
398        }
399
400        if let Some(props) = schema.get_mut("properties").and_then(Value::as_object_mut) {
401            for (field, correction) in corrections.iter() {
402                if props.contains_key(field) {
403                    props.insert(field.clone(), correction.clone());
404                }
405            }
406        }
407        schema
408    }
409
410    /// Register a user correction for a specific field type
411    ///
412    /// # Example
413    ///
414    /// ```
415    /// use stygian_graph::application::schema_discovery::{SchemaDiscoveryService, SchemaDiscoveryConfig};
416    /// use serde_json::json;
417    ///
418    /// let svc = SchemaDiscoveryService::new(SchemaDiscoveryConfig::default(), None);
419    /// // Always use "integer" for "id" fields regardless of inferred type
420    /// svc.add_correction("id".to_string(), json!({"type": "integer"}));
421    /// ```
422    pub fn add_correction(&self, field_name: String, schema_fragment: Value) {
423        self.corrections.write().insert(field_name, schema_fragment);
424    }
425
426    /// Cache a schema for a URL pattern (useful for preloading known schemas)
427    ///
428    /// # Example
429    ///
430    /// ```
431    /// use stygian_graph::application::schema_discovery::{SchemaDiscoveryService, SchemaDiscoveryConfig, SchemaSource};
432    /// use serde_json::json;
433    ///
434    /// let svc = SchemaDiscoveryService::new(SchemaDiscoveryConfig::default(), None);
435    /// svc.cache_schema(
436    ///     "example.com/products".to_string(),
437    ///     json!({"type": "object"}),
438    ///     SchemaSource::UserCorrection,
439    ///     1.0,
440    /// );
441    /// ```
442    pub fn cache_schema(
443        &self,
444        url_pattern: String,
445        schema: Value,
446        source: SchemaSource,
447        confidence: f32,
448    ) {
449        let discovered = DiscoveredSchema {
450            schema,
451            source,
452            confidence,
453            url_pattern: Some(url_pattern.clone()),
454        };
455        self.cache.write().insert(url_pattern, discovered);
456    }
457
458    /// Detect schema evolution between a known schema and a newly inferred one.
459    ///
460    /// Returns a diff describing added/removed/changed fields.
461    ///
462    /// # Example
463    ///
464    /// ```
465    /// use stygian_graph::application::schema_discovery::{SchemaDiscoveryService, SchemaDiscoveryConfig};
466    /// use serde_json::json;
467    ///
468    /// let svc = SchemaDiscoveryService::new(SchemaDiscoveryConfig::default(), None);
469    ///
470    /// let old = json!({
471    ///     "type": "object",
472    ///     "properties": {"name": {"type": "string"}, "price": {"type": "number"}}
473    /// });
474    /// let new = json!({
475    ///     "type": "object",
476    ///     "properties": {"name": {"type": "string"}, "discount": {"type": "number"}}
477    /// });
478    ///
479    /// let diff = svc.detect_evolution(&old, &new);
480    /// assert!(diff.added_fields.contains(&"discount".to_string()));
481    /// assert!(diff.removed_fields.contains(&"price".to_string()));
482    /// ```
483    pub fn detect_evolution(&self, old_schema: &Value, new_schema: &Value) -> SchemaDiff {
484        let old_props = old_schema
485            .get("properties")
486            .and_then(Value::as_object)
487            .cloned()
488            .unwrap_or_default();
489        let new_props = new_schema
490            .get("properties")
491            .and_then(Value::as_object)
492            .cloned()
493            .unwrap_or_default();
494
495        let added_fields: Vec<String> = new_props
496            .keys()
497            .filter(|k| !old_props.contains_key(*k))
498            .cloned()
499            .collect();
500
501        let removed_fields: Vec<String> = old_props
502            .keys()
503            .filter(|k| !new_props.contains_key(*k))
504            .cloned()
505            .collect();
506
507        let type_changes: Vec<(String, String, String)> = old_props
508            .iter()
509            .filter_map(|(k, old_v)| {
510                let new_v = new_props.get(k)?;
511                let old_t = old_v.get("type").and_then(Value::as_str).unwrap_or("any");
512                let new_t = new_v.get("type").and_then(Value::as_str).unwrap_or("any");
513                (old_t != new_t).then(|| (k.clone(), old_t.to_string(), new_t.to_string()))
514            })
515            .collect();
516
517        let is_compatible = removed_fields.is_empty() && type_changes.is_empty();
518
519        SchemaDiff {
520            added_fields,
521            removed_fields,
522            type_changes,
523            is_compatible,
524        }
525    }
526
527    /// Discover a JSON Schema by introspecting a live GraphQL endpoint.
528    ///
529    /// Fires the standard `__schema` introspection query, converts the returned
530    /// SDL types to a JSON Schema representation, and caches the result by
531    /// endpoint URL.
532    ///
533    /// # Errors
534    ///
535    /// Returns `Err` if no GraphQL service is configured, if the endpoint is
536    /// unreachable, if the response contains GraphQL errors, or if the
537    /// introspection response cannot be parsed.
538    ///
539    /// # Example
540    ///
541    /// ```no_run
542    /// use stygian_graph::application::schema_discovery::{SchemaDiscoveryService, SchemaDiscoveryConfig};
543    /// use stygian_graph::adapters::graphql::GraphQlService;
544    /// use std::sync::Arc;
545    ///
546    /// # tokio::runtime::Runtime::new().unwrap().block_on(async {
547    /// let gql = Arc::new(GraphQlService::new(Default::default(), None));
548    /// let svc = SchemaDiscoveryService::new(SchemaDiscoveryConfig::default(), None)
549    ///     .with_graphql_service(gql);
550    /// let schema = svc.infer_from_graphql("https://api.example.com/graphql", None).await;
551    /// # });
552    /// ```
553    #[allow(clippy::indexing_slicing)]
554    pub async fn infer_from_graphql(
555        &self,
556        endpoint: &str,
557        auth: Option<GraphQlAuth>,
558    ) -> Result<DiscoveredSchema> {
559        // ── 1. Cache check ────────────────────────────────────────────────
560        if self.config.cache_schemas {
561            let cache = self.cache.read();
562            if let Some(cached) = cache.get(endpoint) {
563                debug!(endpoint, "GraphQL introspection cache hit");
564                return Ok(DiscoveredSchema {
565                    source: SchemaSource::Cached,
566                    ..cached.clone()
567                });
568            }
569        }
570
571        // ── 2. Require a GraphQL service ──────────────────────────────────
572        let service = self.graphql_service.as_ref().ok_or_else(|| {
573            StygianError::Service(ServiceError::Unavailable(
574                "No GraphQL service configured for introspection".to_string(),
575            ))
576        })?;
577
578        // ── 3. Build ServiceInput with introspection query ────────────────
579        let mut params = json!({
580            "query": INTROSPECTION_QUERY,
581        });
582        if let Some(a) = auth {
583            params["auth"] = json!({
584                "kind": match a.kind {
585                    crate::ports::GraphQlAuthKind::Bearer => "bearer",
586                    crate::ports::GraphQlAuthKind::ApiKey => "api_key",
587                    crate::ports::GraphQlAuthKind::Header => "header",
588                    crate::ports::GraphQlAuthKind::None => "none",
589                },
590                "token": a.token,
591            });
592            if let Some(header) = a.header_name {
593                params["auth"]["header_name"] = json!(header);
594            }
595        }
596
597        let input = ServiceInput {
598            url: endpoint.to_string(),
599            params,
600        };
601
602        // ── 4. Execute ────────────────────────────────────────────────────
603        let output = service.execute(input).await?;
604
605        // ── 5. Parse __schema ─────────────────────────────────────────────
606        let response_json: Value = serde_json::from_str(&output.data)
607            .map_err(|e| StygianError::Service(ServiceError::InvalidResponse(e.to_string())))?;
608        let introspection = response_json.get("__schema").ok_or_else(|| {
609            StygianError::Service(ServiceError::InvalidResponse(
610                "introspection response missing __schema key".to_string(),
611            ))
612        })?;
613
614        // ── 6. Build JSON Schema from types ───────────────────────────────
615        let query_type_name = introspection
616            .get("queryType")
617            .and_then(|qt| qt.get("name"))
618            .and_then(Value::as_str)
619            .unwrap_or("Query")
620            .to_string();
621
622        let types = introspection
623            .get("types")
624            .and_then(Value::as_array)
625            .cloned()
626            .unwrap_or_default();
627
628        let mut definitions = Map::new();
629        for gql_type in &types {
630            let name = gql_type.get("name").and_then(Value::as_str).unwrap_or("");
631            // Skip built-in introspection types
632            if name.is_empty() || name.starts_with('_') {
633                continue;
634            }
635            let def = Self::convert_type_to_definition(gql_type);
636            definitions.insert(name.to_string(), def);
637        }
638
639        let schema = json!({
640            "type": "object",
641            "description": format!("GraphQL schema for {endpoint}"),
642            "definitions": Value::Object(definitions),
643            "queryType": query_type_name,
644        });
645
646        let discovered = DiscoveredSchema {
647            schema,
648            source: SchemaSource::AiInference,
649            confidence: 1.0,
650            url_pattern: Some(endpoint.to_string()),
651        };
652
653        // ── 7. Cache ───────────────────────────────────────────────────────
654        if self.config.cache_schemas {
655            self.cache
656                .write()
657                .insert(endpoint.to_string(), discovered.clone());
658        }
659
660        Ok(discovered)
661    }
662
663    /// Convert a top-level GraphQL type definition to a JSON Schema object.
664    #[allow(clippy::indexing_slicing)]
665    fn convert_type_to_definition(gql_type: &Value) -> Value {
666        let kind = gql_type.get("kind").and_then(Value::as_str).unwrap_or("");
667        match kind {
668            "OBJECT" | "INTERFACE" | "INPUT_OBJECT" => {
669                let fields: Vec<Value> = gql_type
670                    .get("fields")
671                    .or_else(|| gql_type.get("inputFields"))
672                    .and_then(Value::as_array)
673                    .cloned()
674                    .unwrap_or_default();
675
676                let mut properties = Map::new();
677                let mut required: Vec<Value> = Vec::new();
678
679                for field in &fields {
680                    let field_name = field.get("name").and_then(Value::as_str).unwrap_or("");
681                    if field_name.is_empty() {
682                        continue;
683                    }
684                    let type_ref = &field["type"];
685                    // NON_NULL at the outermost level means the field is required
686                    if type_ref.get("kind").and_then(Value::as_str) == Some("NON_NULL") {
687                        required.push(json!(field_name));
688                    }
689                    properties.insert(field_name.to_string(), Self::convert_type_ref(type_ref));
690                }
691
692                let mut obj = json!({
693                    "type": "object",
694                    "properties": Value::Object(properties),
695                });
696                if !required.is_empty() {
697                    obj["required"] = json!(required);
698                }
699                obj
700            }
701            "ENUM" => {
702                let values: Vec<Value> = gql_type
703                    .get("enumValues")
704                    .and_then(Value::as_array)
705                    .map(|vals| {
706                        vals.iter()
707                            .filter_map(|v| v.get("name").and_then(Value::as_str))
708                            .map(|s| json!(s))
709                            .collect()
710                    })
711                    .unwrap_or_default();
712                json!({ "type": "string", "enum": values })
713            }
714            "SCALAR" => {
715                let name = gql_type.get("name").and_then(Value::as_str).unwrap_or("");
716                Self::scalar_to_json_schema(name)
717            }
718            _ => json!({}),
719        }
720    }
721
722    /// Recursively convert a GraphQL type reference (field type) to JSON Schema.
723    fn convert_type_ref(type_ref: &Value) -> Value {
724        let kind = type_ref.get("kind").and_then(Value::as_str).unwrap_or("");
725        match kind {
726            "NON_NULL" => Self::convert_type_ref(&type_ref["ofType"]),
727            "LIST" => json!({
728                "type": "array",
729                "items": Self::convert_type_ref(&type_ref["ofType"]),
730            }),
731            "OBJECT" | "INTERFACE" | "INPUT_OBJECT" | "ENUM" => {
732                let name = type_ref.get("name").and_then(Value::as_str).unwrap_or("");
733                json!({ "$ref": format!("#/definitions/{name}") })
734            }
735            "SCALAR" => {
736                let name = type_ref.get("name").and_then(Value::as_str).unwrap_or("");
737                Self::scalar_to_json_schema(name)
738            }
739            _ => json!({}),
740        }
741    }
742
743    /// Map a GraphQL scalar name to its JSON Schema type.
744    fn scalar_to_json_schema(name: &str) -> Value {
745        match name {
746            "String" | "ID" => json!({ "type": "string" }),
747            "Int" => json!({ "type": "integer" }),
748            "Float" => json!({ "type": "number" }),
749            "Boolean" => json!({ "type": "boolean" }),
750            s if s.to_ascii_lowercase().contains("datetime")
751                || s.to_ascii_lowercase().contains("date") =>
752            {
753                json!({ "type": "string", "format": "date-time" })
754            }
755            _ => json!({}),
756        }
757    }
758
759    /// Validate that a value is a structurally valid JSON Schema object.
760    ///
761    /// Checks for the presence of a `type` or `properties` field.
762    ///
763    /// # Example
764    ///
765    /// ```
766    /// use stygian_graph::application::schema_discovery::SchemaDiscoveryService;
767    /// use serde_json::json;
768    ///
769    /// assert!(SchemaDiscoveryService::is_valid_schema(&json!({"type": "object"})));
770    /// assert!(!SchemaDiscoveryService::is_valid_schema(&json!("not a schema")));
771    /// ```
772    #[must_use]
773    pub fn is_valid_schema(schema: &Value) -> bool {
774        if !schema.is_object() {
775            return false;
776        }
777        schema.get("type").is_some() || schema.get("properties").is_some()
778    }
779}
780
781#[cfg(test)]
782#[allow(
783    clippy::unwrap_used,
784    clippy::indexing_slicing,
785    clippy::approx_constant,
786    clippy::significant_drop_tightening,
787    clippy::float_cmp,
788    clippy::unnecessary_map_or
789)]
790mod tests {
791    use super::*;
792    use serde_json::json;
793
794    fn svc() -> SchemaDiscoveryService {
795        SchemaDiscoveryService::new(SchemaDiscoveryConfig::default(), None)
796    }
797
798    // --- infer_from_example ---
799
800    #[test]
801    fn test_infer_object_schema() {
802        let s = svc();
803        let schema = s.infer_from_example(&json!({"name": "Alice", "age": 30, "active": true}));
804        assert_eq!(schema["type"].as_str().unwrap(), "object");
805        assert_eq!(
806            schema["properties"]["name"]["type"].as_str().unwrap(),
807            "string"
808        );
809        assert_eq!(
810            schema["properties"]["age"]["type"].as_str().unwrap(),
811            "integer"
812        );
813        assert_eq!(
814            schema["properties"]["active"]["type"].as_str().unwrap(),
815            "boolean"
816        );
817    }
818
819    #[test]
820    fn test_infer_array_schema() {
821        let s = svc();
822        let schema = s.infer_from_example(&json!([{"id": 1}, {"id": 2}]));
823        assert_eq!(schema["type"].as_str().unwrap(), "array");
824        assert_eq!(schema["items"]["type"].as_str().unwrap(), "object");
825    }
826
827    #[test]
828    fn test_infer_string_schema() {
829        let s = svc();
830        let schema = s.infer_from_example(&json!("hello"));
831        assert_eq!(schema["type"].as_str().unwrap(), "string");
832    }
833
834    #[test]
835    fn test_infer_number_float() {
836        let s = svc();
837        let schema = s.infer_from_example(&json!(3.14));
838        assert_eq!(schema["type"].as_str().unwrap(), "number");
839    }
840
841    #[test]
842    fn test_infer_null() {
843        let s = svc();
844        let schema = s.infer_from_example(&Value::Null);
845        assert_eq!(schema["type"].as_str().unwrap(), "null");
846    }
847
848    // --- is_valid_schema ---
849
850    #[test]
851    fn test_valid_schema_with_type() {
852        assert!(SchemaDiscoveryService::is_valid_schema(
853            &json!({"type": "object"})
854        ));
855    }
856
857    #[test]
858    fn test_valid_schema_with_properties() {
859        assert!(SchemaDiscoveryService::is_valid_schema(
860            &json!({"properties": {"x": {"type": "string"}}})
861        ));
862    }
863
864    #[test]
865    fn test_invalid_schema_string() {
866        assert!(!SchemaDiscoveryService::is_valid_schema(&json!(
867            "not schema"
868        )));
869    }
870
871    #[test]
872    fn test_invalid_schema_empty_object() {
873        // An empty object has neither "type" nor "properties"
874        assert!(!SchemaDiscoveryService::is_valid_schema(&json!({})));
875    }
876
877    // --- detect_evolution ---
878
879    #[test]
880    fn test_evolution_added_field() {
881        let s = svc();
882        let old = json!({"type": "object", "properties": {"name": {"type": "string"}}});
883        let new = json!({"type": "object", "properties": {"name": {"type": "string"}, "email": {"type": "string"}}});
884        let diff = s.detect_evolution(&old, &new);
885        assert!(diff.added_fields.contains(&"email".to_string()));
886        assert!(diff.removed_fields.is_empty());
887        assert!(diff.is_compatible);
888    }
889
890    #[test]
891    fn test_evolution_removed_field() {
892        let s = svc();
893        let old = json!({"type": "object", "properties": {"name": {"type": "string"}, "age": {"type": "integer"}}});
894        let new = json!({"type": "object", "properties": {"name": {"type": "string"}}});
895        let diff = s.detect_evolution(&old, &new);
896        assert!(diff.removed_fields.contains(&"age".to_string()));
897        assert!(!diff.is_compatible);
898    }
899
900    #[test]
901    fn test_evolution_type_change() {
902        let s = svc();
903        let old = json!({"type": "object", "properties": {"id": {"type": "string"}}});
904        let new = json!({"type": "object", "properties": {"id": {"type": "integer"}}});
905        let diff = s.detect_evolution(&old, &new);
906        assert_eq!(diff.type_changes.len(), 1);
907        assert_eq!(
908            diff.type_changes[0],
909            (
910                "id".to_string(),
911                "string".to_string(),
912                "integer".to_string()
913            )
914        );
915        assert!(!diff.is_compatible);
916    }
917
918    #[test]
919    fn test_evolution_no_change() {
920        let s = svc();
921        let schema = json!({"type": "object", "properties": {"x": {"type": "string"}}});
922        let diff = s.detect_evolution(&schema, &schema);
923        assert!(diff.added_fields.is_empty());
924        assert!(diff.removed_fields.is_empty());
925        assert!(diff.is_compatible);
926    }
927
928    // --- corrections ---
929
930    #[test]
931    fn test_correction_overrides_inferred_type() {
932        let s = svc();
933        s.add_correction("id".to_string(), json!({"type": "integer"}));
934
935        let schema = json!({
936            "type": "object",
937            "properties": {
938                "id": {"type": "string"},
939                "name": {"type": "string"}
940            }
941        });
942        let corrected = s.apply_corrections(schema);
943        assert_eq!(
944            corrected["properties"]["id"]["type"].as_str().unwrap(),
945            "integer"
946        );
947        assert_eq!(
948            corrected["properties"]["name"]["type"].as_str().unwrap(),
949            "string"
950        );
951    }
952
953    // --- cache ---
954
955    #[test]
956    fn test_cache_and_retrieve() {
957        let s = svc();
958        let schema = json!({"type": "object", "properties": {"title": {"type": "string"}}});
959        s.cache_schema(
960            "example.com".to_string(),
961            schema.clone(),
962            SchemaSource::UserCorrection,
963            1.0,
964        );
965
966        let cache = s.cache.read();
967        let entry = cache.get("example.com").unwrap();
968        assert_eq!(entry.schema, schema);
969        assert_eq!(entry.source, SchemaSource::UserCorrection);
970    }
971
972    // --- HTML inference without provider ---
973
974    #[tokio::test]
975    async fn test_infer_from_html_no_provider() {
976        let s = svc();
977        let result = s
978            .infer_from_html("<html><body>test</body></html>", None)
979            .await;
980        assert!(result.is_err());
981        assert!(result.unwrap_err().to_string().contains("No AI provider"));
982    }
983
984    // --- HTML inference with cache hit ---
985
986    #[tokio::test]
987    async fn test_infer_from_html_cache_hit() {
988        let s = svc();
989        let schema = json!({"type": "object"});
990        s.cache_schema(
991            "example.com".to_string(),
992            schema.clone(),
993            SchemaSource::Cached,
994            0.9,
995        );
996
997        let result = s
998            .infer_from_html("<html/>", Some("example.com"))
999            .await
1000            .unwrap();
1001        assert_eq!(result.source, SchemaSource::Cached);
1002        assert_eq!(result.schema, schema);
1003    }
1004
1005    // ── GraphQL introspection tests ──────────────────────────────────────────
1006
    /// Minimal mock [`ScrapingService`] that returns a pre-baked `Value` as `data`.
    ///
    /// The wrapped value (field `0`) is the canned response; its
    /// `ScrapingService::execute` implementation serialises it to a JSON string
    /// on each call and ignores the request input.
    struct MockGraphQlService(Value);
1009
1010    #[async_trait::async_trait]
1011    impl ScrapingService for MockGraphQlService {
1012        fn name(&self) -> &'static str {
1013            "mock_graphql"
1014        }
1015        async fn execute(
1016            &self,
1017            _input: crate::ports::ServiceInput,
1018        ) -> Result<crate::ports::ServiceOutput> {
1019            Ok(crate::ports::ServiceOutput {
1020                data: serde_json::to_string(&self.0).unwrap_or_default(),
1021                metadata: Value::default(),
1022            })
1023        }
1024    }
1025
1026    impl SchemaDiscoveryService {
1027        /// Test-only constructor: pre-bake a GraphQL introspection response.
1028        ///
1029        /// The `response` value should represent the `data` object that a real
1030        /// GraphQL service would return (i.e. `{ "__schema": { ... } }`).
1031        fn with_mock_graphql(config: SchemaDiscoveryConfig, response: Value) -> Self {
1032            let svc = Arc::new(MockGraphQlService(response));
1033            Self::new(config, None).with_graphql_service(svc)
1034        }
1035    }
1036
1037    fn make_introspection_response_simple() -> Value {
1038        json!({
1039            "__schema": {
1040                "queryType": { "name": "Query" },
1041                "mutationType": null,
1042                "subscriptionType": null,
1043                "types": [
1044                    {
1045                        "kind": "OBJECT",
1046                        "name": "Query",
1047                        "fields": [
1048                            {
1049                                "name": "user",
1050                                "type": { "kind": "OBJECT", "name": "User", "ofType": null }
1051                            }
1052                        ],
1053                        "inputFields": null,
1054                        "interfaces": [],
1055                        "enumValues": null,
1056                        "possibleTypes": null
1057                    },
1058                    {
1059                        "kind": "OBJECT",
1060                        "name": "User",
1061                        "fields": [
1062                            {
1063                                "name": "id",
1064                                "type": { "kind": "NON_NULL", "name": null, "ofType": { "kind": "SCALAR", "name": "ID", "ofType": null } }
1065                            },
1066                            {
1067                                "name": "name",
1068                                "type": { "kind": "SCALAR", "name": "String", "ofType": null }
1069                            }
1070                        ],
1071                        "inputFields": null,
1072                        "interfaces": [],
1073                        "enumValues": null,
1074                        "possibleTypes": null
1075                    },
1076                    {
1077                        "kind": "SCALAR",
1078                        "name": "String",
1079                        "fields": null,
1080                        "inputFields": null,
1081                        "interfaces": null,
1082                        "enumValues": null,
1083                        "possibleTypes": null
1084                    },
1085                    {
1086                        "kind": "SCALAR",
1087                        "name": "ID",
1088                        "fields": null,
1089                        "inputFields": null,
1090                        "interfaces": null,
1091                        "enumValues": null,
1092                        "possibleTypes": null
1093                    }
1094                ],
1095                "directives": []
1096            }
1097        })
1098    }
1099
1100    #[tokio::test]
1101    async fn introspection_no_graphql_service() {
1102        let s = svc(); // no graphql_service configured
1103        let result = s
1104            .infer_from_graphql("https://api.example.com/graphql", None)
1105            .await;
1106        assert!(result.is_err());
1107        assert!(
1108            result
1109                .unwrap_err()
1110                .to_string()
1111                .contains("No GraphQL service")
1112        );
1113    }
1114
1115    #[test]
1116    fn graphql_type_to_json_schema_scalar() {
1117        assert_eq!(
1118            SchemaDiscoveryService::scalar_to_json_schema("String"),
1119            json!({"type": "string"})
1120        );
1121        assert_eq!(
1122            SchemaDiscoveryService::scalar_to_json_schema("Int"),
1123            json!({"type": "integer"})
1124        );
1125        assert_eq!(
1126            SchemaDiscoveryService::scalar_to_json_schema("Float"),
1127            json!({"type": "number"})
1128        );
1129        assert_eq!(
1130            SchemaDiscoveryService::scalar_to_json_schema("Boolean"),
1131            json!({"type": "boolean"})
1132        );
1133        assert_eq!(
1134            SchemaDiscoveryService::scalar_to_json_schema("ID"),
1135            json!({"type": "string"})
1136        );
1137        assert_eq!(
1138            SchemaDiscoveryService::scalar_to_json_schema("DateTime"),
1139            json!({"type": "string", "format": "date-time"})
1140        );
1141        // Unknown scalar → permissive empty schema
1142        assert_eq!(
1143            SchemaDiscoveryService::scalar_to_json_schema("JSON"),
1144            json!({})
1145        );
1146    }
1147
1148    #[test]
1149    fn graphql_type_to_json_schema_object() {
1150        let gql_type = json!({
1151            "kind": "OBJECT",
1152            "name": "User",
1153            "fields": [
1154                {
1155                    "name": "id",
1156                    "type": { "kind": "SCALAR", "name": "ID", "ofType": null }
1157                },
1158                {
1159                    "name": "email",
1160                    "type": { "kind": "SCALAR", "name": "String", "ofType": null }
1161                }
1162            ]
1163        });
1164        let schema = SchemaDiscoveryService::convert_type_to_definition(&gql_type);
1165        assert_eq!(schema["type"].as_str().unwrap(), "object");
1166        assert_eq!(
1167            schema["properties"]["id"]["type"].as_str().unwrap(),
1168            "string"
1169        );
1170        assert_eq!(
1171            schema["properties"]["email"]["type"].as_str().unwrap(),
1172            "string"
1173        );
1174        // Neither field is NON_NULL, so no required array
1175        assert!(
1176            schema.get("required").is_none()
1177                || schema["required"].as_array().is_none_or(Vec::is_empty)
1178        );
1179    }
1180
1181    #[test]
1182    fn graphql_non_null_adds_to_required() {
1183        let gql_type = json!({
1184            "kind": "OBJECT",
1185            "name": "Product",
1186            "fields": [
1187                {
1188                    "name": "sku",
1189                    "type": {
1190                        "kind": "NON_NULL",
1191                        "name": null,
1192                        "ofType": { "kind": "SCALAR", "name": "String", "ofType": null }
1193                    }
1194                },
1195                {
1196                    "name": "description",
1197                    "type": { "kind": "SCALAR", "name": "String", "ofType": null }
1198                }
1199            ]
1200        });
1201        let schema = SchemaDiscoveryService::convert_type_to_definition(&gql_type);
1202        let required = schema["required"].as_array().unwrap();
1203        assert!(required.contains(&json!("sku")));
1204        assert!(!required.contains(&json!("description")));
1205    }
1206
1207    #[tokio::test]
1208    async fn introspection_result_cached() {
1209        let response = make_introspection_response_simple();
1210        let s =
1211            SchemaDiscoveryService::with_mock_graphql(SchemaDiscoveryConfig::default(), response);
1212
1213        // First call — fetches from mock service
1214        let first = s
1215            .infer_from_graphql("https://api.example.com/graphql", None)
1216            .await
1217            .unwrap();
1218        assert_eq!(first.source, SchemaSource::AiInference);
1219        assert_eq!(first.confidence, 1.0);
1220
1221        // Second call — served from cache (source changes to Cached)
1222        let second = s
1223            .infer_from_graphql("https://api.example.com/graphql", None)
1224            .await
1225            .unwrap();
1226        assert_eq!(second.source, SchemaSource::Cached);
1227        assert_eq!(second.schema["queryType"].as_str().unwrap(), "Query");
1228    }
1229}