stygian_graph/application/
schema_discovery.rs

1//! Intelligent schema discovery service
2//!
3//! Automatically infers JSON Schema definitions from:
4//! 1. **JSON/object examples** — type-inspects values to generate schema
5//! 2. **HTML content** — uses an AI provider to suggest extraction schemas
6//! 3. **User corrections** — applies overrides stored in a correction map
7//! 4. **Evolution detection** — compares new inferences against cached schemas
8//!
//! The service sits in the application layer. It depends on `AIProvider` for
//! LLM-assisted HTML schema inference and, optionally, on a `ScrapingService`
//! for GraphQL introspection.
11//!
12//! # Example
13//!
14//! ```no_run
15//! use stygian_graph::application::schema_discovery::{SchemaDiscoveryService, SchemaDiscoveryConfig};
16//! use serde_json::json;
17//!
18//! # tokio::runtime::Runtime::new().unwrap().block_on(async {
19//! let service = SchemaDiscoveryService::new(SchemaDiscoveryConfig::default(), None);
20//!
21//! // Infer from a JSON example
22//! let example = json!({"name": "Alice", "age": 30, "active": true});
23//! let schema = service.infer_from_example(&example);
24//! assert_eq!(schema["type"].as_str().unwrap(), "object");
25//! # });
26//! ```
27
28use std::collections::HashMap;
29use std::sync::Arc;
30
31use parking_lot::RwLock;
32use serde_json::{Map, Value, json};
33use tracing::debug;
34
/// Full standard GraphQL introspection query, sent verbatim as the `query`
/// parameter by `SchemaDiscoveryService::infer_from_graphql`.
///
/// The `TypeRef` fragment unwraps `ofType` six levels deep, which is enough for
/// Relay connection types such as `NON_NULL(LIST(NON_NULL(OBJECT)))`.
const INTROSPECTION_QUERY: &str = r"
query IntrospectionQuery {
  __schema {
    queryType { name }
    mutationType { name }
    subscriptionType { name }
    types { ...FullType }
    directives {
      name description locations
      args { ...InputValue }
    }
  }
}
fragment FullType on __Type {
  kind name description
  fields(includeDeprecated: true) {
    name description
    args { ...InputValue }
    type { ...TypeRef }
    isDeprecated deprecationReason
  }
  inputFields { ...InputValue }
  interfaces { ...TypeRef }
  enumValues(includeDeprecated: true) {
    name description isDeprecated deprecationReason
  }
  possibleTypes { ...TypeRef }
}
fragment InputValue on __InputValue {
  name description
  type { ...TypeRef }
  defaultValue
}
fragment TypeRef on __Type {
  kind name
  ofType {
    kind name
    ofType {
      kind name
      ofType {
        kind name
        ofType {
          kind name
          ofType {
            kind name
            ofType { kind name }
          }
        }
      }
    }
  }
}
";
90
91use crate::domain::error::{ProviderError, Result, ServiceError, StygianError};
92use crate::ports::{AIProvider, GraphQlAuth, ScrapingService, ServiceInput};
93
/// A JSON Schema produced by [`SchemaDiscoveryService`], bundled with
/// metadata describing where it came from and how much it can be trusted.
#[derive(Debug, Clone)]
pub struct DiscoveredSchema {
    /// The JSON Schema document itself
    pub schema: Value,
    /// How the schema was derived (inference, cache hit, user-supplied)
    pub source: SchemaSource,
    /// Confidence score, nominally in the range [0.0, 1.0]
    pub confidence: f32,
    /// URL pattern, domain or endpoint the schema is associated with, if any;
    /// this is also the key the schema is cached under
    pub url_pattern: Option<String>,
}
106
/// Provenance of a [`DiscoveredSchema`]: how the schema was derived
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum SchemaSource {
    /// Inferred locally by type-inspecting a JSON example
    ExampleInference,
    /// Generated by an AI provider analysing HTML structure
    AiInference,
    /// Served from the service's schema cache instead of being re-derived
    Cached,
    /// Manually supplied / corrected by the user
    UserCorrection,
}
119
/// Configuration for the schema discovery service
#[derive(Debug, Clone)]
pub struct SchemaDiscoveryConfig {
    /// Maximum HTML content length sent to the AI provider.
    ///
    /// Despite the name, `infer_from_html` compares this limit against the
    /// byte length of the HTML (`str::len`), not its character count.
    pub max_content_chars: usize,
    /// Whether discovered schemas are cached by URL pattern / endpoint
    pub cache_schemas: bool,
    /// Minimum confidence to accept an AI-inferred schema.
    ///
    /// NOTE(review): no code visible in this part of the file reads this
    /// field — confirm where (or whether) the threshold is enforced.
    pub min_ai_confidence: f32,
}
130
impl Default for SchemaDiscoveryConfig {
    /// Default configuration: a content limit of 16 000, schema caching
    /// enabled, and a minimum AI confidence of 0.6.
    fn default() -> Self {
        Self {
            max_content_chars: 16_000,
            cache_schemas: true,
            min_ai_confidence: 0.6,
        }
    }
}
140
/// Schema evolution diff — describes how the top-level `properties` of two
/// schema versions differ. Produced by `SchemaDiscoveryService::detect_evolution`.
#[derive(Debug, Clone)]
pub struct SchemaDiff {
    /// Fields present in the new schema but not in the old one
    pub added_fields: Vec<String>,
    /// Fields present in the old schema but not in the new one
    pub removed_fields: Vec<String>,
    /// Fields whose `type` changed, as `(field, old_type, new_type)` tuples
    pub type_changes: Vec<(String, String, String)>, // (field, old_type, new_type)
    /// Whether the schemas are considered compatible: `true` only when no
    /// field was removed and no field changed type (additions are allowed)
    pub is_compatible: bool,
}
153
/// Intelligent schema discovery service
///
/// Infers JSON Schemas from content using local heuristics, an optional
/// AI provider, and an optional GraphQL scraping service. Maintains a schema
/// cache keyed by URL pattern / endpoint and supports user corrections.
pub struct SchemaDiscoveryService {
    /// Behavioural settings (content limit, caching, confidence threshold)
    config: SchemaDiscoveryConfig,
    /// Optional AI provider for HTML-to-schema inference
    ai_provider: Option<Arc<dyn AIProvider>>,
    /// Schema cache: `url_pattern` → [`DiscoveredSchema`]
    cache: Arc<RwLock<HashMap<String, DiscoveredSchema>>>,
    /// User correction map: `field_name` → corrected schema fragment
    corrections: Arc<RwLock<HashMap<String, Value>>>,
    /// Optional GraphQL scraping service for introspection; unset until
    /// `with_graphql_service` is called
    graphql_service: Option<Arc<dyn ScrapingService>>,
}
170
171impl SchemaDiscoveryService {
172    /// Create a new schema discovery service
173    ///
174    /// # Example
175    ///
176    /// ```
177    /// use stygian_graph::application::schema_discovery::{SchemaDiscoveryService, SchemaDiscoveryConfig};
178    ///
179    /// let service = SchemaDiscoveryService::new(SchemaDiscoveryConfig::default(), None);
180    /// ```
181    pub fn new(config: SchemaDiscoveryConfig, ai_provider: Option<Arc<dyn AIProvider>>) -> Self {
182        Self {
183            config,
184            ai_provider,
185            cache: Arc::new(RwLock::new(HashMap::new())),
186            corrections: Arc::new(RwLock::new(HashMap::new())),
187            graphql_service: None,
188        }
189    }
190
191    /// Attach a GraphQL scraping service used by [`infer_from_graphql`](Self::infer_from_graphql).
192    ///
193    /// # Example
194    ///
195    /// ```no_run
196    /// use stygian_graph::application::schema_discovery::{SchemaDiscoveryService, SchemaDiscoveryConfig};
197    /// use stygian_graph::adapters::graphql::GraphQlService;
198    /// use std::sync::Arc;
199    ///
200    /// let gql = Arc::new(GraphQlService::new(Default::default(), None));
201    /// let svc = SchemaDiscoveryService::new(SchemaDiscoveryConfig::default(), None)
202    ///     .with_graphql_service(gql);
203    /// ```
204    #[must_use]
205    pub fn with_graphql_service(mut self, service: Arc<dyn ScrapingService>) -> Self {
206        self.graphql_service = Some(service);
207        self
208    }
209
    /// Infer a JSON Schema by type-inspecting a JSON example value.
    ///
    /// Supports objects, arrays, strings, numbers, booleans, and null.
    /// Every key of an example object is listed as `required`, whole numbers
    /// are reported as `"integer"` and other numbers as `"number"`, and an
    /// array's `items` schema is inferred from its first element only.
    ///
    /// # Example
    ///
    /// ```
    /// use stygian_graph::application::schema_discovery::{SchemaDiscoveryService, SchemaDiscoveryConfig};
    /// use serde_json::json;
    ///
    /// let svc = SchemaDiscoveryService::new(SchemaDiscoveryConfig::default(), None);
    /// let example = json!({"name": "Alice", "score": 9.5, "active": true});
    /// let schema = svc.infer_from_example(&example);
    ///
    /// assert_eq!(schema["type"].as_str().unwrap(), "object");
    /// assert_eq!(schema["properties"]["name"]["type"].as_str().unwrap(), "string");
    /// assert_eq!(schema["properties"]["score"]["type"].as_str().unwrap(), "number");
    /// ```
    pub fn infer_from_example(&self, example: &Value) -> Value {
        // Pure delegation: inference needs no service state.
        Self::infer_schema_for_value(example)
    }
231
232    /// Recursively infer a JSON Schema for a single JSON value
233    fn infer_schema_for_value(value: &Value) -> Value {
234        match value {
235            Value::Object(map) => {
236                let mut properties = Map::new();
237                let mut required = Vec::new();
238                for (key, val) in map {
239                    properties.insert(key.clone(), Self::infer_schema_for_value(val));
240                    required.push(json!(key));
241                }
242                json!({
243                    "type": "object",
244                    "properties": Value::Object(properties),
245                    "required": required
246                })
247            }
248            Value::Array(items) => {
249                // Infer item schema from first element
250                let item_schema = items
251                    .first()
252                    .map_or_else(|| json!({}), Self::infer_schema_for_value);
253                json!({
254                    "type": "array",
255                    "items": item_schema
256                })
257            }
258            Value::String(_) => json!({"type": "string"}),
259            Value::Number(n) => {
260                if n.is_i64() || n.is_u64() {
261                    json!({"type": "integer"})
262                } else {
263                    json!({"type": "number"})
264                }
265            }
266            Value::Bool(_) => json!({"type": "boolean"}),
267            Value::Null => json!({"type": "null"}),
268        }
269    }
270
271    /// Infer a schema from HTML content using an AI provider.
272    ///
273    /// Returns an error if no AI provider is configured.
274    ///
275    /// # Example
276    ///
277    /// ```no_run
278    /// use stygian_graph::application::schema_discovery::{SchemaDiscoveryService, SchemaDiscoveryConfig};
279    ///
280    /// # tokio::runtime::Runtime::new().unwrap().block_on(async {
281    /// let service = SchemaDiscoveryService::new(SchemaDiscoveryConfig::default(), None);
282    /// // Without an AI provider, this returns Err
283    /// let result = service.infer_from_html("<html>...</html>", None).await;
284    /// assert!(result.is_err());
285    /// # });
286    /// ```
287    pub async fn infer_from_html(
288        &self,
289        html: &str,
290        url_pattern: Option<&str>,
291    ) -> Result<DiscoveredSchema> {
292        // Check cache first
293        if self.config.cache_schemas
294            && let Some(pattern) = url_pattern
295        {
296            let cache = self.cache.read();
297            if let Some(cached) = cache.get(pattern) {
298                debug!(pattern, "Schema cache hit");
299                return Ok(DiscoveredSchema {
300                    source: SchemaSource::Cached,
301                    ..cached.clone()
302                });
303            }
304        }
305
306        let provider = self.ai_provider.as_ref().ok_or_else(|| {
307            StygianError::Provider(ProviderError::ApiError(
308                "No AI provider configured for HTML schema discovery".to_string(),
309            ))
310        })?;
311
312        let truncated = if html.len() > self.config.max_content_chars {
313            &html[..self.config.max_content_chars]
314        } else {
315            html
316        };
317
318        let discovery_schema = json!({
319            "type": "object",
320            "properties": {
321                "schema": {
322                    "type": "object",
323                    "description": "A valid JSON Schema object describing the structured data in the HTML"
324                },
325                "confidence": {
326                    "type": "number",
327                    "minimum": 0.0,
328                    "maximum": 1.0,
329                    "description": "Confidence in the inferred schema [0.0, 1.0]"
330                },
331                "description": {
332                    "type": "string",
333                    "description": "Human-readable explanation of what data was detected"
334                }
335            },
336            "required": ["schema", "confidence"]
337        });
338
339        let prompt = format!(
340            "Analyze the following HTML page and infer a JSON Schema for the main structured \
341             data it contains (e.g. product listings, articles, search results). \
342             Return the schema definition.\n\nHTML:\n{truncated}"
343        );
344
345        let result = provider.extract(prompt, discovery_schema).await?;
346
347        let schema = result
348            .get("schema")
349            .cloned()
350            .unwrap_or_else(|| json!({"type": "object"}));
351
352        #[allow(clippy::cast_possible_truncation)]
353        let confidence = result
354            .get("confidence")
355            .and_then(Value::as_f64)
356            .unwrap_or(0.7) as f32;
357
358        // Apply any user corrections
359        let schema = self.apply_corrections(schema);
360
361        let discovered = DiscoveredSchema {
362            schema,
363            source: SchemaSource::AiInference,
364            confidence,
365            url_pattern: url_pattern.map(str::to_string),
366        };
367
368        // Cache the result
369        if self.config.cache_schemas
370            && let Some(pattern) = url_pattern
371        {
372            self.cache
373                .write()
374                .insert(pattern.to_string(), discovered.clone());
375        }
376
377        Ok(discovered)
378    }
379
380    /// Apply user corrections to an inferred schema
381    fn apply_corrections(&self, mut schema: Value) -> Value {
382        let corrections = self.corrections.read();
383        if corrections.is_empty() {
384            return schema;
385        }
386
387        if let Some(props) = schema.get_mut("properties").and_then(Value::as_object_mut) {
388            for (field, correction) in corrections.iter() {
389                if props.contains_key(field) {
390                    props.insert(field.clone(), correction.clone());
391                }
392            }
393        }
394        schema
395    }
396
397    /// Register a user correction for a specific field type
398    ///
399    /// # Example
400    ///
401    /// ```
402    /// use stygian_graph::application::schema_discovery::{SchemaDiscoveryService, SchemaDiscoveryConfig};
403    /// use serde_json::json;
404    ///
405    /// let svc = SchemaDiscoveryService::new(SchemaDiscoveryConfig::default(), None);
406    /// // Always use "integer" for "id" fields regardless of inferred type
407    /// svc.add_correction("id".to_string(), json!({"type": "integer"}));
408    /// ```
409    pub fn add_correction(&self, field_name: String, schema_fragment: Value) {
410        self.corrections.write().insert(field_name, schema_fragment);
411    }
412
413    /// Cache a schema for a URL pattern (useful for preloading known schemas)
414    ///
415    /// # Example
416    ///
417    /// ```
418    /// use stygian_graph::application::schema_discovery::{SchemaDiscoveryService, SchemaDiscoveryConfig, SchemaSource};
419    /// use serde_json::json;
420    ///
421    /// let svc = SchemaDiscoveryService::new(SchemaDiscoveryConfig::default(), None);
422    /// svc.cache_schema(
423    ///     "example.com/products".to_string(),
424    ///     json!({"type": "object"}),
425    ///     SchemaSource::UserCorrection,
426    ///     1.0,
427    /// );
428    /// ```
429    pub fn cache_schema(
430        &self,
431        url_pattern: String,
432        schema: Value,
433        source: SchemaSource,
434        confidence: f32,
435    ) {
436        let discovered = DiscoveredSchema {
437            schema,
438            source,
439            confidence,
440            url_pattern: Some(url_pattern.clone()),
441        };
442        self.cache.write().insert(url_pattern, discovered);
443    }
444
445    /// Detect schema evolution between a known schema and a newly inferred one.
446    ///
447    /// Returns a diff describing added/removed/changed fields.
448    ///
449    /// # Example
450    ///
451    /// ```
452    /// use stygian_graph::application::schema_discovery::{SchemaDiscoveryService, SchemaDiscoveryConfig};
453    /// use serde_json::json;
454    ///
455    /// let svc = SchemaDiscoveryService::new(SchemaDiscoveryConfig::default(), None);
456    ///
457    /// let old = json!({
458    ///     "type": "object",
459    ///     "properties": {"name": {"type": "string"}, "price": {"type": "number"}}
460    /// });
461    /// let new = json!({
462    ///     "type": "object",
463    ///     "properties": {"name": {"type": "string"}, "discount": {"type": "number"}}
464    /// });
465    ///
466    /// let diff = svc.detect_evolution(&old, &new);
467    /// assert!(diff.added_fields.contains(&"discount".to_string()));
468    /// assert!(diff.removed_fields.contains(&"price".to_string()));
469    /// ```
470    pub fn detect_evolution(&self, old_schema: &Value, new_schema: &Value) -> SchemaDiff {
471        let old_props = old_schema
472            .get("properties")
473            .and_then(Value::as_object)
474            .cloned()
475            .unwrap_or_default();
476        let new_props = new_schema
477            .get("properties")
478            .and_then(Value::as_object)
479            .cloned()
480            .unwrap_or_default();
481
482        let added_fields: Vec<String> = new_props
483            .keys()
484            .filter(|k| !old_props.contains_key(*k))
485            .cloned()
486            .collect();
487
488        let removed_fields: Vec<String> = old_props
489            .keys()
490            .filter(|k| !new_props.contains_key(*k))
491            .cloned()
492            .collect();
493
494        let type_changes: Vec<(String, String, String)> = old_props
495            .iter()
496            .filter_map(|(k, old_v)| {
497                let new_v = new_props.get(k)?;
498                let old_t = old_v.get("type").and_then(Value::as_str).unwrap_or("any");
499                let new_t = new_v.get("type").and_then(Value::as_str).unwrap_or("any");
500                (old_t != new_t).then(|| (k.clone(), old_t.to_string(), new_t.to_string()))
501            })
502            .collect();
503
504        let is_compatible = removed_fields.is_empty() && type_changes.is_empty();
505
506        SchemaDiff {
507            added_fields,
508            removed_fields,
509            type_changes,
510            is_compatible,
511        }
512    }
513
514    /// Discover a JSON Schema by introspecting a live GraphQL endpoint.
515    ///
516    /// Fires the standard `__schema` introspection query, converts the returned
517    /// SDL types to a JSON Schema representation, and caches the result by
518    /// endpoint URL.
519    ///
520    /// # Errors
521    ///
522    /// Returns `Err` if no GraphQL service is configured, if the endpoint is
523    /// unreachable, if the response contains GraphQL errors, or if the
524    /// introspection response cannot be parsed.
525    ///
526    /// # Example
527    ///
528    /// ```no_run
529    /// use stygian_graph::application::schema_discovery::{SchemaDiscoveryService, SchemaDiscoveryConfig};
530    /// use stygian_graph::adapters::graphql::GraphQlService;
531    /// use std::sync::Arc;
532    ///
533    /// # tokio::runtime::Runtime::new().unwrap().block_on(async {
534    /// let gql = Arc::new(GraphQlService::new(Default::default(), None));
535    /// let svc = SchemaDiscoveryService::new(SchemaDiscoveryConfig::default(), None)
536    ///     .with_graphql_service(gql);
537    /// let schema = svc.infer_from_graphql("https://api.example.com/graphql", None).await;
538    /// # });
539    /// ```
540    #[allow(clippy::indexing_slicing)]
541    pub async fn infer_from_graphql(
542        &self,
543        endpoint: &str,
544        auth: Option<GraphQlAuth>,
545    ) -> Result<DiscoveredSchema> {
546        // ── 1. Cache check ────────────────────────────────────────────────
547        if self.config.cache_schemas {
548            let cache = self.cache.read();
549            if let Some(cached) = cache.get(endpoint) {
550                debug!(endpoint, "GraphQL introspection cache hit");
551                return Ok(DiscoveredSchema {
552                    source: SchemaSource::Cached,
553                    ..cached.clone()
554                });
555            }
556        }
557
558        // ── 2. Require a GraphQL service ──────────────────────────────────
559        let service = self.graphql_service.as_ref().ok_or_else(|| {
560            StygianError::Service(ServiceError::Unavailable(
561                "No GraphQL service configured for introspection".to_string(),
562            ))
563        })?;
564
565        // ── 3. Build ServiceInput with introspection query ────────────────
566        let mut params = json!({
567            "query": INTROSPECTION_QUERY,
568        });
569        if let Some(a) = auth {
570            params["auth"] = json!({
571                "kind": match a.kind {
572                    crate::ports::GraphQlAuthKind::Bearer => "bearer",
573                    crate::ports::GraphQlAuthKind::ApiKey => "api_key",
574                    crate::ports::GraphQlAuthKind::Header => "header",
575                    crate::ports::GraphQlAuthKind::None => "none",
576                },
577                "token": a.token,
578            });
579            if let Some(header) = a.header_name {
580                params["auth"]["header_name"] = json!(header);
581            }
582        }
583
584        let input = ServiceInput {
585            url: endpoint.to_string(),
586            params,
587        };
588
589        // ── 4. Execute ────────────────────────────────────────────────────
590        let output = service.execute(input).await?;
591
592        // ── 5. Parse __schema ─────────────────────────────────────────────
593        let response_json: Value = serde_json::from_str(&output.data)
594            .map_err(|e| StygianError::Service(ServiceError::InvalidResponse(e.to_string())))?;
595        let introspection = response_json.get("__schema").ok_or_else(|| {
596            StygianError::Service(ServiceError::InvalidResponse(
597                "introspection response missing __schema key".to_string(),
598            ))
599        })?;
600
601        // ── 6. Build JSON Schema from types ───────────────────────────────
602        let query_type_name = introspection
603            .get("queryType")
604            .and_then(|qt| qt.get("name"))
605            .and_then(Value::as_str)
606            .unwrap_or("Query")
607            .to_string();
608
609        let types = introspection
610            .get("types")
611            .and_then(Value::as_array)
612            .cloned()
613            .unwrap_or_default();
614
615        let mut definitions = Map::new();
616        for gql_type in &types {
617            let name = gql_type.get("name").and_then(Value::as_str).unwrap_or("");
618            // Skip built-in introspection types
619            if name.is_empty() || name.starts_with('_') {
620                continue;
621            }
622            let def = Self::convert_type_to_definition(gql_type);
623            definitions.insert(name.to_string(), def);
624        }
625
626        let schema = json!({
627            "type": "object",
628            "description": format!("GraphQL schema for {endpoint}"),
629            "definitions": Value::Object(definitions),
630            "queryType": query_type_name,
631        });
632
633        let discovered = DiscoveredSchema {
634            schema,
635            source: SchemaSource::AiInference,
636            confidence: 1.0,
637            url_pattern: Some(endpoint.to_string()),
638        };
639
640        // ── 7. Cache ───────────────────────────────────────────────────────
641        if self.config.cache_schemas {
642            self.cache
643                .write()
644                .insert(endpoint.to_string(), discovered.clone());
645        }
646
647        Ok(discovered)
648    }
649
650    /// Convert a top-level GraphQL type definition to a JSON Schema object.
651    #[allow(clippy::indexing_slicing)]
652    fn convert_type_to_definition(gql_type: &Value) -> Value {
653        let kind = gql_type.get("kind").and_then(Value::as_str).unwrap_or("");
654        match kind {
655            "OBJECT" | "INTERFACE" | "INPUT_OBJECT" => {
656                let fields: Vec<Value> = gql_type
657                    .get("fields")
658                    .or_else(|| gql_type.get("inputFields"))
659                    .and_then(Value::as_array)
660                    .cloned()
661                    .unwrap_or_default();
662
663                let mut properties = Map::new();
664                let mut required: Vec<Value> = Vec::new();
665
666                for field in &fields {
667                    let field_name = field.get("name").and_then(Value::as_str).unwrap_or("");
668                    if field_name.is_empty() {
669                        continue;
670                    }
671                    let type_ref = &field["type"];
672                    // NON_NULL at the outermost level means the field is required
673                    if type_ref.get("kind").and_then(Value::as_str) == Some("NON_NULL") {
674                        required.push(json!(field_name));
675                    }
676                    properties.insert(field_name.to_string(), Self::convert_type_ref(type_ref));
677                }
678
679                let mut obj = json!({
680                    "type": "object",
681                    "properties": Value::Object(properties),
682                });
683                if !required.is_empty() {
684                    obj["required"] = json!(required);
685                }
686                obj
687            }
688            "ENUM" => {
689                let values: Vec<Value> = gql_type
690                    .get("enumValues")
691                    .and_then(Value::as_array)
692                    .map(|vals| {
693                        vals.iter()
694                            .filter_map(|v| v.get("name").and_then(Value::as_str))
695                            .map(|s| json!(s))
696                            .collect()
697                    })
698                    .unwrap_or_default();
699                json!({ "type": "string", "enum": values })
700            }
701            "SCALAR" => {
702                let name = gql_type.get("name").and_then(Value::as_str).unwrap_or("");
703                Self::scalar_to_json_schema(name)
704            }
705            _ => json!({}),
706        }
707    }
708
709    /// Recursively convert a GraphQL type reference (field type) to JSON Schema.
710    fn convert_type_ref(type_ref: &Value) -> Value {
711        let kind = type_ref.get("kind").and_then(Value::as_str).unwrap_or("");
712        match kind {
713            "NON_NULL" => Self::convert_type_ref(&type_ref["ofType"]),
714            "LIST" => json!({
715                "type": "array",
716                "items": Self::convert_type_ref(&type_ref["ofType"]),
717            }),
718            "OBJECT" | "INTERFACE" | "INPUT_OBJECT" | "ENUM" => {
719                let name = type_ref.get("name").and_then(Value::as_str).unwrap_or("");
720                json!({ "$ref": format!("#/definitions/{name}") })
721            }
722            "SCALAR" => {
723                let name = type_ref.get("name").and_then(Value::as_str).unwrap_or("");
724                Self::scalar_to_json_schema(name)
725            }
726            _ => json!({}),
727        }
728    }
729
730    /// Map a GraphQL scalar name to its JSON Schema type.
731    fn scalar_to_json_schema(name: &str) -> Value {
732        match name {
733            "String" | "ID" => json!({ "type": "string" }),
734            "Int" => json!({ "type": "integer" }),
735            "Float" => json!({ "type": "number" }),
736            "Boolean" => json!({ "type": "boolean" }),
737            s if s.to_ascii_lowercase().contains("datetime")
738                || s.to_ascii_lowercase().contains("date") =>
739            {
740                json!({ "type": "string", "format": "date-time" })
741            }
742            _ => json!({}),
743        }
744    }
745
746    /// Validate that a value is a structurally valid JSON Schema object.
747    ///
748    /// Checks for the presence of a `type` or `properties` field.
749    ///
750    /// # Example
751    ///
752    /// ```
753    /// use stygian_graph::application::schema_discovery::SchemaDiscoveryService;
754    /// use serde_json::json;
755    ///
756    /// assert!(SchemaDiscoveryService::is_valid_schema(&json!({"type": "object"})));
757    /// assert!(!SchemaDiscoveryService::is_valid_schema(&json!("not a schema")));
758    /// ```
759    pub fn is_valid_schema(schema: &Value) -> bool {
760        if !schema.is_object() {
761            return false;
762        }
763        schema.get("type").is_some() || schema.get("properties").is_some()
764    }
765}
766
767#[cfg(test)]
768#[allow(
769    clippy::unwrap_used,
770    clippy::indexing_slicing,
771    clippy::approx_constant,
772    clippy::significant_drop_tightening,
773    clippy::float_cmp,
774    clippy::unnecessary_map_or
775)]
776mod tests {
777    use super::*;
778    use serde_json::json;
779
780    fn svc() -> SchemaDiscoveryService {
781        SchemaDiscoveryService::new(SchemaDiscoveryConfig::default(), None)
782    }
783
784    // --- infer_from_example ---
785
786    #[test]
787    fn test_infer_object_schema() {
788        let s = svc();
789        let schema = s.infer_from_example(&json!({"name": "Alice", "age": 30, "active": true}));
790        assert_eq!(schema["type"].as_str().unwrap(), "object");
791        assert_eq!(
792            schema["properties"]["name"]["type"].as_str().unwrap(),
793            "string"
794        );
795        assert_eq!(
796            schema["properties"]["age"]["type"].as_str().unwrap(),
797            "integer"
798        );
799        assert_eq!(
800            schema["properties"]["active"]["type"].as_str().unwrap(),
801            "boolean"
802        );
803    }
804
805    #[test]
806    fn test_infer_array_schema() {
807        let s = svc();
808        let schema = s.infer_from_example(&json!([{"id": 1}, {"id": 2}]));
809        assert_eq!(schema["type"].as_str().unwrap(), "array");
810        assert_eq!(schema["items"]["type"].as_str().unwrap(), "object");
811    }
812
813    #[test]
814    fn test_infer_string_schema() {
815        let s = svc();
816        let schema = s.infer_from_example(&json!("hello"));
817        assert_eq!(schema["type"].as_str().unwrap(), "string");
818    }
819
820    #[test]
821    fn test_infer_number_float() {
822        let s = svc();
823        let schema = s.infer_from_example(&json!(3.14));
824        assert_eq!(schema["type"].as_str().unwrap(), "number");
825    }
826
827    #[test]
828    fn test_infer_null() {
829        let s = svc();
830        let schema = s.infer_from_example(&Value::Null);
831        assert_eq!(schema["type"].as_str().unwrap(), "null");
832    }
833
834    // --- is_valid_schema ---
835
836    #[test]
837    fn test_valid_schema_with_type() {
838        assert!(SchemaDiscoveryService::is_valid_schema(
839            &json!({"type": "object"})
840        ));
841    }
842
843    #[test]
844    fn test_valid_schema_with_properties() {
845        assert!(SchemaDiscoveryService::is_valid_schema(
846            &json!({"properties": {"x": {"type": "string"}}})
847        ));
848    }
849
850    #[test]
851    fn test_invalid_schema_string() {
852        assert!(!SchemaDiscoveryService::is_valid_schema(&json!(
853            "not schema"
854        )));
855    }
856
857    #[test]
858    fn test_invalid_schema_empty_object() {
859        // An empty object has neither "type" nor "properties"
860        assert!(!SchemaDiscoveryService::is_valid_schema(&json!({})));
861    }
862
863    // --- detect_evolution ---
864
865    #[test]
866    fn test_evolution_added_field() {
867        let s = svc();
868        let old = json!({"type": "object", "properties": {"name": {"type": "string"}}});
869        let new = json!({"type": "object", "properties": {"name": {"type": "string"}, "email": {"type": "string"}}});
870        let diff = s.detect_evolution(&old, &new);
871        assert!(diff.added_fields.contains(&"email".to_string()));
872        assert!(diff.removed_fields.is_empty());
873        assert!(diff.is_compatible);
874    }
875
876    #[test]
877    fn test_evolution_removed_field() {
878        let s = svc();
879        let old = json!({"type": "object", "properties": {"name": {"type": "string"}, "age": {"type": "integer"}}});
880        let new = json!({"type": "object", "properties": {"name": {"type": "string"}}});
881        let diff = s.detect_evolution(&old, &new);
882        assert!(diff.removed_fields.contains(&"age".to_string()));
883        assert!(!diff.is_compatible);
884    }
885
886    #[test]
887    fn test_evolution_type_change() {
888        let s = svc();
889        let old = json!({"type": "object", "properties": {"id": {"type": "string"}}});
890        let new = json!({"type": "object", "properties": {"id": {"type": "integer"}}});
891        let diff = s.detect_evolution(&old, &new);
892        assert_eq!(diff.type_changes.len(), 1);
893        assert_eq!(
894            diff.type_changes[0],
895            (
896                "id".to_string(),
897                "string".to_string(),
898                "integer".to_string()
899            )
900        );
901        assert!(!diff.is_compatible);
902    }
903
904    #[test]
905    fn test_evolution_no_change() {
906        let s = svc();
907        let schema = json!({"type": "object", "properties": {"x": {"type": "string"}}});
908        let diff = s.detect_evolution(&schema, &schema);
909        assert!(diff.added_fields.is_empty());
910        assert!(diff.removed_fields.is_empty());
911        assert!(diff.is_compatible);
912    }
913
914    // --- corrections ---
915
916    #[test]
917    fn test_correction_overrides_inferred_type() {
918        let s = svc();
919        s.add_correction("id".to_string(), json!({"type": "integer"}));
920
921        let schema = json!({
922            "type": "object",
923            "properties": {
924                "id": {"type": "string"},
925                "name": {"type": "string"}
926            }
927        });
928        let corrected = s.apply_corrections(schema);
929        assert_eq!(
930            corrected["properties"]["id"]["type"].as_str().unwrap(),
931            "integer"
932        );
933        assert_eq!(
934            corrected["properties"]["name"]["type"].as_str().unwrap(),
935            "string"
936        );
937    }
938
939    // --- cache ---
940
941    #[test]
942    fn test_cache_and_retrieve() {
943        let s = svc();
944        let schema = json!({"type": "object", "properties": {"title": {"type": "string"}}});
945        s.cache_schema(
946            "example.com".to_string(),
947            schema.clone(),
948            SchemaSource::UserCorrection,
949            1.0,
950        );
951
952        let cache = s.cache.read();
953        let entry = cache.get("example.com").unwrap();
954        assert_eq!(entry.schema, schema);
955        assert_eq!(entry.source, SchemaSource::UserCorrection);
956    }
957
958    // --- HTML inference without provider ---
959
960    #[tokio::test]
961    async fn test_infer_from_html_no_provider() {
962        let s = svc();
963        let result = s
964            .infer_from_html("<html><body>test</body></html>", None)
965            .await;
966        assert!(result.is_err());
967        assert!(result.unwrap_err().to_string().contains("No AI provider"));
968    }
969
970    // --- HTML inference with cache hit ---
971
972    #[tokio::test]
973    async fn test_infer_from_html_cache_hit() {
974        let s = svc();
975        let schema = json!({"type": "object"});
976        s.cache_schema(
977            "example.com".to_string(),
978            schema.clone(),
979            SchemaSource::Cached,
980            0.9,
981        );
982
983        let result = s
984            .infer_from_html("<html/>", Some("example.com"))
985            .await
986            .unwrap();
987        assert_eq!(result.source, SchemaSource::Cached);
988        assert_eq!(result.schema, schema);
989    }
990
991    // ── GraphQL introspection tests ──────────────────────────────────────────
992
993    /// Minimal mock [`ScrapingService`] that returns a pre-baked `Value` as `data`.
994    struct MockGraphQlService(Value);
995
996    #[async_trait::async_trait]
997    impl ScrapingService for MockGraphQlService {
998        fn name(&self) -> &'static str {
999            "mock_graphql"
1000        }
1001        async fn execute(
1002            &self,
1003            _input: crate::ports::ServiceInput,
1004        ) -> Result<crate::ports::ServiceOutput> {
1005            Ok(crate::ports::ServiceOutput {
1006                data: serde_json::to_string(&self.0).unwrap_or_default(),
1007                metadata: Value::default(),
1008            })
1009        }
1010    }
1011
1012    impl SchemaDiscoveryService {
1013        /// Test-only constructor: pre-bake a GraphQL introspection response.
1014        ///
1015        /// The `response` value should represent the `data` object that a real
1016        /// GraphQL service would return (i.e. `{ "__schema": { ... } }`).
1017        fn with_mock_graphql(config: SchemaDiscoveryConfig, response: Value) -> Self {
1018            let svc = Arc::new(MockGraphQlService(response));
1019            Self::new(config, None).with_graphql_service(svc)
1020        }
1021    }
1022
1023    fn make_introspection_response_simple() -> Value {
1024        json!({
1025            "__schema": {
1026                "queryType": { "name": "Query" },
1027                "mutationType": null,
1028                "subscriptionType": null,
1029                "types": [
1030                    {
1031                        "kind": "OBJECT",
1032                        "name": "Query",
1033                        "fields": [
1034                            {
1035                                "name": "user",
1036                                "type": { "kind": "OBJECT", "name": "User", "ofType": null }
1037                            }
1038                        ],
1039                        "inputFields": null,
1040                        "interfaces": [],
1041                        "enumValues": null,
1042                        "possibleTypes": null
1043                    },
1044                    {
1045                        "kind": "OBJECT",
1046                        "name": "User",
1047                        "fields": [
1048                            {
1049                                "name": "id",
1050                                "type": { "kind": "NON_NULL", "name": null, "ofType": { "kind": "SCALAR", "name": "ID", "ofType": null } }
1051                            },
1052                            {
1053                                "name": "name",
1054                                "type": { "kind": "SCALAR", "name": "String", "ofType": null }
1055                            }
1056                        ],
1057                        "inputFields": null,
1058                        "interfaces": [],
1059                        "enumValues": null,
1060                        "possibleTypes": null
1061                    },
1062                    {
1063                        "kind": "SCALAR",
1064                        "name": "String",
1065                        "fields": null,
1066                        "inputFields": null,
1067                        "interfaces": null,
1068                        "enumValues": null,
1069                        "possibleTypes": null
1070                    },
1071                    {
1072                        "kind": "SCALAR",
1073                        "name": "ID",
1074                        "fields": null,
1075                        "inputFields": null,
1076                        "interfaces": null,
1077                        "enumValues": null,
1078                        "possibleTypes": null
1079                    }
1080                ],
1081                "directives": []
1082            }
1083        })
1084    }
1085
1086    #[tokio::test]
1087    async fn introspection_no_graphql_service() {
1088        let s = svc(); // no graphql_service configured
1089        let result = s
1090            .infer_from_graphql("https://api.example.com/graphql", None)
1091            .await;
1092        assert!(result.is_err());
1093        assert!(
1094            result
1095                .unwrap_err()
1096                .to_string()
1097                .contains("No GraphQL service")
1098        );
1099    }
1100
1101    #[test]
1102    fn graphql_type_to_json_schema_scalar() {
1103        assert_eq!(
1104            SchemaDiscoveryService::scalar_to_json_schema("String"),
1105            json!({"type": "string"})
1106        );
1107        assert_eq!(
1108            SchemaDiscoveryService::scalar_to_json_schema("Int"),
1109            json!({"type": "integer"})
1110        );
1111        assert_eq!(
1112            SchemaDiscoveryService::scalar_to_json_schema("Float"),
1113            json!({"type": "number"})
1114        );
1115        assert_eq!(
1116            SchemaDiscoveryService::scalar_to_json_schema("Boolean"),
1117            json!({"type": "boolean"})
1118        );
1119        assert_eq!(
1120            SchemaDiscoveryService::scalar_to_json_schema("ID"),
1121            json!({"type": "string"})
1122        );
1123        assert_eq!(
1124            SchemaDiscoveryService::scalar_to_json_schema("DateTime"),
1125            json!({"type": "string", "format": "date-time"})
1126        );
1127        // Unknown scalar → permissive empty schema
1128        assert_eq!(
1129            SchemaDiscoveryService::scalar_to_json_schema("JSON"),
1130            json!({})
1131        );
1132    }
1133
1134    #[test]
1135    fn graphql_type_to_json_schema_object() {
1136        let gql_type = json!({
1137            "kind": "OBJECT",
1138            "name": "User",
1139            "fields": [
1140                {
1141                    "name": "id",
1142                    "type": { "kind": "SCALAR", "name": "ID", "ofType": null }
1143                },
1144                {
1145                    "name": "email",
1146                    "type": { "kind": "SCALAR", "name": "String", "ofType": null }
1147                }
1148            ]
1149        });
1150        let schema = SchemaDiscoveryService::convert_type_to_definition(&gql_type);
1151        assert_eq!(schema["type"].as_str().unwrap(), "object");
1152        assert_eq!(
1153            schema["properties"]["id"]["type"].as_str().unwrap(),
1154            "string"
1155        );
1156        assert_eq!(
1157            schema["properties"]["email"]["type"].as_str().unwrap(),
1158            "string"
1159        );
1160        // Neither field is NON_NULL, so no required array
1161        assert!(
1162            schema.get("required").is_none()
1163                || schema["required"].as_array().is_none_or(Vec::is_empty)
1164        );
1165    }
1166
1167    #[test]
1168    fn graphql_non_null_adds_to_required() {
1169        let gql_type = json!({
1170            "kind": "OBJECT",
1171            "name": "Product",
1172            "fields": [
1173                {
1174                    "name": "sku",
1175                    "type": {
1176                        "kind": "NON_NULL",
1177                        "name": null,
1178                        "ofType": { "kind": "SCALAR", "name": "String", "ofType": null }
1179                    }
1180                },
1181                {
1182                    "name": "description",
1183                    "type": { "kind": "SCALAR", "name": "String", "ofType": null }
1184                }
1185            ]
1186        });
1187        let schema = SchemaDiscoveryService::convert_type_to_definition(&gql_type);
1188        let required = schema["required"].as_array().unwrap();
1189        assert!(required.contains(&json!("sku")));
1190        assert!(!required.contains(&json!("description")));
1191    }
1192
1193    #[tokio::test]
1194    async fn introspection_result_cached() {
1195        let response = make_introspection_response_simple();
1196        let s =
1197            SchemaDiscoveryService::with_mock_graphql(SchemaDiscoveryConfig::default(), response);
1198
1199        // First call — fetches from mock service
1200        let first = s
1201            .infer_from_graphql("https://api.example.com/graphql", None)
1202            .await
1203            .unwrap();
1204        assert_eq!(first.source, SchemaSource::AiInference);
1205        assert_eq!(first.confidence, 1.0);
1206
1207        // Second call — served from cache (source changes to Cached)
1208        let second = s
1209            .infer_from_graphql("https://api.example.com/graphql", None)
1210            .await
1211            .unwrap();
1212        assert_eq!(second.source, SchemaSource::Cached);
1213        assert_eq!(second.schema["queryType"].as_str().unwrap(), "Query");
1214    }
1215}