use std::collections::HashMap; use std::path::Path; use chrono::Utc; use compliance_core::error::CoreError; use compliance_core::models::graph::{ CodeEdge, CodeEdgeKind, CodeNode, GraphBuildRun, GraphBuildStatus, }; use compliance_core::traits::graph_builder::ParseOutput; use petgraph::graph::{DiGraph, NodeIndex}; use tracing::info; use crate::parsers::registry::ParserRegistry; use super::community::detect_communities; use super::impact::ImpactAnalyzer; /// Walk up the qualified-name hierarchy to find the closest ancestor /// that exists in the node map. /// /// For `"src/main.rs::config::load"` this tries: /// 1. `"src/main.rs::config"` (trim last `::` segment) /// 2. `"src/main.rs"` (trim again) /// /// Returns the first match found, or `None` if the node is a root. fn find_parent_qname(qname: &str, node_map: &HashMap) -> Option { let mut current = qname.to_string(); loop { // Try stripping the last "::" segment if let Some(pos) = current.rfind("::") { current.truncate(pos); if node_map.contains_key(¤t) { return Some(current); } continue; } // No more "::" — this is a top-level node (file), no parent return None; } } /// The main graph engine that builds and manages code knowledge graphs pub struct GraphEngine { parser_registry: ParserRegistry, max_nodes: u32, } /// In-memory representation of a built code graph pub struct CodeGraph { pub graph: DiGraph, pub node_map: HashMap, pub nodes: Vec, pub edges: Vec, } impl GraphEngine { pub fn new(max_nodes: u32) -> Self { Self { parser_registry: ParserRegistry::new(), max_nodes, } } /// Build a code graph from a repository directory pub fn build_graph( &self, repo_path: &Path, repo_id: &str, graph_build_id: &str, ) -> Result<(CodeGraph, GraphBuildRun), CoreError> { let mut build_run = GraphBuildRun::new(repo_id.to_string()); info!(repo_id, path = %repo_path.display(), "Starting graph build"); // Phase 1: Parse all files let parse_output = self.parser_registry.parse_directory( repo_path, repo_id, graph_build_id, self.max_nodes, )?; // Phase 2: Build petgraph let code_graph = self.build_petgraph(parse_output)?; // Phase 3: Run community detection let community_count = detect_communities(&code_graph); // Collect language stats let mut languages: Vec = code_graph .nodes .iter() .map(|n| n.language.clone()) .collect::>() .into_iter() .collect(); languages.sort(); build_run.node_count = code_graph.nodes.len() as u32; build_run.edge_count = code_graph.edges.len() as u32; build_run.community_count = community_count; build_run.languages_parsed = languages; build_run.status = GraphBuildStatus::Completed; build_run.completed_at = Some(Utc::now()); info!( nodes = build_run.node_count, edges = build_run.edge_count, communities = build_run.community_count, "Graph build complete" ); Ok((code_graph, build_run)) } /// Build petgraph from parsed output, resolving edges to node indices. /// /// After resolving the explicit edges from parsers, we synthesise /// `Contains` edges so that every node is reachable from its parent /// file or module. This eliminates disconnected "islands" that /// otherwise appear when files share no direct call/import edges. fn build_petgraph(&self, parse_output: ParseOutput) -> Result { let mut graph = DiGraph::new(); let mut node_map: HashMap = HashMap::new(); let mut nodes = parse_output.nodes; // Add all nodes to the graph for node in &mut nodes { let idx = graph.add_node(node.qualified_name.clone()); node.graph_index = Some(idx.index() as u32); node_map.insert(node.qualified_name.clone(), idx); } // Resolve and add explicit edges from parsers let mut resolved_edges = Vec::new(); for mut edge in parse_output.edges { let source_idx = node_map.get(&edge.source); let resolved = self.resolve_edge_target(&edge.target, &node_map); if let (Some(&src), Some(tgt)) = (source_idx, resolved) { let resolved_name = node_map .iter() .find(|(_, &idx)| idx == tgt) .map(|(name, _)| name.clone()); if let Some(name) = resolved_name { edge.target = name; } graph.add_edge(src, tgt, edge.kind.clone()); resolved_edges.push(edge); } } // Synthesise Contains edges: connect each node to its closest // parent in the qualified-name hierarchy. // // For "src/main.rs::config::load", the parent chain is: // "src/main.rs::config" → "src/main.rs" // // We walk up the qualified name (splitting on "::") and link to // the first ancestor that exists in the node map. let repo_id = nodes.first().map(|n| n.repo_id.as_str()).unwrap_or(""); let build_id = nodes .first() .map(|n| n.graph_build_id.as_str()) .unwrap_or(""); let qualified_names: Vec = nodes.iter().map(|n| n.qualified_name.clone()).collect(); let file_paths: HashMap = nodes .iter() .map(|n| (n.qualified_name.clone(), n.file_path.clone())) .collect(); for qname in &qualified_names { if let Some(parent_qname) = find_parent_qname(qname, &node_map) { let child_idx = node_map[qname]; let parent_idx = node_map[&parent_qname]; // Avoid duplicate edges if !graph.contains_edge(parent_idx, child_idx) { graph.add_edge(parent_idx, child_idx, CodeEdgeKind::Contains); resolved_edges.push(CodeEdge { id: None, repo_id: repo_id.to_string(), graph_build_id: build_id.to_string(), source: parent_qname, target: qname.clone(), kind: CodeEdgeKind::Contains, file_path: file_paths.get(qname).cloned().unwrap_or_default(), line_number: None, }); } } } Ok(CodeGraph { graph, node_map, nodes, edges: resolved_edges, }) } /// Try to resolve an edge target to a known node. /// /// Resolution strategies (in order): /// 1. Direct qualified-name match /// 2. Suffix match: "foo" matches "src/main.rs::mod::foo" /// 3. Module-path match: "config::load" matches "src/config.rs::load" /// 4. Self-method: "self.method" matches "::method" fn resolve_edge_target( &self, target: &str, node_map: &HashMap, ) -> Option { // 1. Direct match if let Some(idx) = node_map.get(target) { return Some(*idx); } // 2. Suffix match: "foo" → "path/file.rs::foo" let suffix_pattern = format!("::{target}"); let dot_pattern = format!(".{target}"); for (qualified, idx) in node_map { if qualified.ends_with(&suffix_pattern) || qualified.ends_with(&dot_pattern) { return Some(*idx); } } // 3. Module-path match: "config::load" → try matching the last N // segments of the target against node qualified names. // This handles cross-file calls like `crate::config::load` or // `super::handlers::process` where the prefix differs. if target.contains("::") { // Strip common Rust path prefixes let stripped = target .strip_prefix("crate::") .or_else(|| target.strip_prefix("super::")) .or_else(|| target.strip_prefix("self::")) .unwrap_or(target); let segments: Vec<&str> = stripped.split("::").collect(); // Try matching progressively shorter suffixes for start in 0..segments.len() { let suffix = segments[start..].join("::"); let pattern = format!("::{suffix}"); for (qualified, idx) in node_map { if qualified.ends_with(&pattern) { return Some(*idx); } } } } // 4. Self-method: "self.method" → "::method" if let Some(method_name) = target.strip_prefix("self.") { let pattern = format!("::{method_name}"); for (qualified, idx) in node_map { if qualified.ends_with(&pattern) { return Some(*idx); } } } None } /// Get the impact analyzer for a built graph pub fn impact_analyzer(code_graph: &CodeGraph) -> ImpactAnalyzer<'_> { ImpactAnalyzer::new(code_graph) } } #[cfg(test)] mod tests { use super::*; use compliance_core::models::graph::{CodeEdgeKind, CodeNode, CodeNodeKind}; fn make_node(qualified_name: &str) -> CodeNode { CodeNode { id: None, repo_id: "test".to_string(), graph_build_id: "build1".to_string(), qualified_name: qualified_name.to_string(), name: qualified_name .split("::") .last() .unwrap_or(qualified_name) .to_string(), kind: CodeNodeKind::Function, file_path: "src/main.rs".to_string(), start_line: 1, end_line: 10, language: "rust".to_string(), community_id: None, is_entry_point: false, graph_index: None, } } fn build_test_node_map(names: &[&str]) -> HashMap { let mut graph: DiGraph = DiGraph::new(); let mut map = HashMap::new(); for name in names { let idx = graph.add_node(name.to_string()); map.insert(name.to_string(), idx); } map } #[test] fn test_resolve_edge_target_direct_match() { let engine = GraphEngine::new(1000); let node_map = build_test_node_map(&["src/main.rs::foo", "src/main.rs::bar"]); let result = engine.resolve_edge_target("src/main.rs::foo", &node_map); assert!(result.is_some()); assert_eq!(result.unwrap(), node_map["src/main.rs::foo"]); } #[test] fn test_resolve_edge_target_short_name_match() { let engine = GraphEngine::new(1000); let node_map = build_test_node_map(&["src/main.rs::foo", "src/main.rs::bar"]); let result = engine.resolve_edge_target("foo", &node_map); assert!(result.is_some()); assert_eq!(result.unwrap(), node_map["src/main.rs::foo"]); } #[test] fn test_resolve_edge_target_method_match() { let engine = GraphEngine::new(1000); let node_map = build_test_node_map(&["src/main.rs::MyStruct::do_thing"]); let result = engine.resolve_edge_target("do_thing", &node_map); assert!(result.is_some()); } #[test] fn test_resolve_edge_target_self_method() { let engine = GraphEngine::new(1000); let node_map = build_test_node_map(&["src/main.rs::MyStruct::process"]); let result = engine.resolve_edge_target("self.process", &node_map); assert!(result.is_some()); } #[test] fn test_resolve_edge_target_no_match() { let engine = GraphEngine::new(1000); let node_map = build_test_node_map(&["src/main.rs::foo"]); let result = engine.resolve_edge_target("nonexistent", &node_map); assert!(result.is_none()); } #[test] fn test_resolve_edge_target_empty_map() { let engine = GraphEngine::new(1000); let node_map = HashMap::new(); let result = engine.resolve_edge_target("anything", &node_map); assert!(result.is_none()); } #[test] fn test_resolve_edge_target_dot_notation() { let engine = GraphEngine::new(1000); let node_map = build_test_node_map(&["src/app.js.handler"]); let result = engine.resolve_edge_target("handler", &node_map); assert!(result.is_some()); } #[test] fn test_build_petgraph_empty() { let engine = GraphEngine::new(1000); let output = ParseOutput::default(); let code_graph = engine.build_petgraph(output).unwrap(); assert_eq!(code_graph.nodes.len(), 0); assert_eq!(code_graph.edges.len(), 0); assert_eq!(code_graph.graph.node_count(), 0); } #[test] fn test_build_petgraph_nodes_get_graph_index() { let engine = GraphEngine::new(1000); let mut output = ParseOutput::default(); output.nodes.push(make_node("src/main.rs::foo")); output.nodes.push(make_node("src/main.rs::bar")); let code_graph = engine.build_petgraph(output).unwrap(); assert_eq!(code_graph.nodes.len(), 2); assert_eq!(code_graph.graph.node_count(), 2); // All nodes should have a graph_index assigned for node in &code_graph.nodes { assert!(node.graph_index.is_some()); } } #[test] fn test_build_petgraph_resolves_edges() { let engine = GraphEngine::new(1000); let mut output = ParseOutput::default(); output.nodes.push(make_node("src/main.rs::foo")); output.nodes.push(make_node("src/main.rs::bar")); output.edges.push(CodeEdge { id: None, repo_id: "test".to_string(), graph_build_id: "build1".to_string(), source: "src/main.rs::foo".to_string(), target: "bar".to_string(), // short name, should resolve kind: CodeEdgeKind::Calls, file_path: "src/main.rs".to_string(), line_number: Some(5), }); let code_graph = engine.build_petgraph(output).unwrap(); assert_eq!(code_graph.edges.len(), 1); assert_eq!(code_graph.graph.edge_count(), 1); // The resolved edge target should be the full qualified name assert_eq!(code_graph.edges[0].target, "src/main.rs::bar"); } #[test] fn test_build_petgraph_skips_unresolved_edges() { let engine = GraphEngine::new(1000); let mut output = ParseOutput::default(); output.nodes.push(make_node("src/main.rs::foo")); output.edges.push(CodeEdge { id: None, repo_id: "test".to_string(), graph_build_id: "build1".to_string(), source: "src/main.rs::foo".to_string(), target: "external_crate::something".to_string(), kind: CodeEdgeKind::Calls, file_path: "src/main.rs".to_string(), line_number: Some(5), }); let code_graph = engine.build_petgraph(output).unwrap(); assert_eq!(code_graph.edges.len(), 0); assert_eq!(code_graph.graph.edge_count(), 0); } #[test] fn test_code_graph_node_map_consistency() { let engine = GraphEngine::new(1000); let mut output = ParseOutput::default(); output.nodes.push(make_node("a::b")); output.nodes.push(make_node("a::c")); output.nodes.push(make_node("a::d")); let code_graph = engine.build_petgraph(output).unwrap(); assert_eq!(code_graph.node_map.len(), 3); assert!(code_graph.node_map.contains_key("a::b")); assert!(code_graph.node_map.contains_key("a::c")); assert!(code_graph.node_map.contains_key("a::d")); } #[test] fn test_contains_edges_synthesised() { let engine = GraphEngine::new(1000); let mut output = ParseOutput::default(); // File → Module → Function hierarchy output.nodes.push(make_node("src/main.rs")); output.nodes.push(make_node("src/main.rs::config")); output.nodes.push(make_node("src/main.rs::config::load")); let code_graph = engine.build_petgraph(output).unwrap(); // Should have 2 Contains edges: // src/main.rs → src/main.rs::config // src/main.rs::config → src/main.rs::config::load let contains_edges: Vec<_> = code_graph .edges .iter() .filter(|e| matches!(e.kind, CodeEdgeKind::Contains)) .collect(); assert_eq!(contains_edges.len(), 2, "expected 2 Contains edges"); let sources: Vec<&str> = contains_edges.iter().map(|e| e.source.as_str()).collect(); assert!(sources.contains(&"src/main.rs")); assert!(sources.contains(&"src/main.rs::config")); } #[test] fn test_contains_edges_no_duplicates_with_existing_edges() { let engine = GraphEngine::new(1000); let mut output = ParseOutput::default(); output.nodes.push(make_node("src/main.rs")); output.nodes.push(make_node("src/main.rs::foo")); // Explicit Calls edge (foo calls itself? just for testing) output.edges.push(CodeEdge { id: None, repo_id: "test".to_string(), graph_build_id: "build1".to_string(), source: "src/main.rs::foo".to_string(), target: "src/main.rs::foo".to_string(), kind: CodeEdgeKind::Calls, file_path: "src/main.rs".to_string(), line_number: Some(1), }); let code_graph = engine.build_petgraph(output).unwrap(); // 1 Calls + 1 Contains = 2 edges total assert_eq!(code_graph.edges.len(), 2); } #[test] fn test_cross_file_resolution_with_module_path() { let engine = GraphEngine::new(1000); let node_map = build_test_node_map(&["src/config.rs::load_config", "src/main.rs::main"]); // "crate::config::load_config" should resolve to "src/config.rs::load_config" let result = engine.resolve_edge_target("crate::config::load_config", &node_map); assert!(result.is_some(), "cross-file crate:: path should resolve"); } #[test] fn test_find_parent_qname() { let node_map = build_test_node_map(&[ "src/main.rs", "src/main.rs::config", "src/main.rs::config::load", ]); assert_eq!( find_parent_qname("src/main.rs::config::load", &node_map), Some("src/main.rs::config".to_string()) ); assert_eq!( find_parent_qname("src/main.rs::config", &node_map), Some("src/main.rs".to_string()) ); assert_eq!(find_parent_qname("src/main.rs", &node_map), None); } }