//! Graph engine: parses a repository directory and builds its in-memory code graph.
use std::collections::HashMap;
|
|
use std::path::Path;
|
|
|
|
use chrono::Utc;
|
|
use compliance_core::error::CoreError;
|
|
use compliance_core::models::graph::{
|
|
CodeEdge, CodeEdgeKind, CodeNode, GraphBuildRun, GraphBuildStatus,
|
|
};
|
|
use compliance_core::traits::graph_builder::ParseOutput;
|
|
use petgraph::graph::{DiGraph, NodeIndex};
|
|
use tracing::info;
|
|
|
|
use crate::parsers::registry::ParserRegistry;
|
|
|
|
use super::community::detect_communities;
|
|
use super::impact::ImpactAnalyzer;
|
|
|
|
/// Walk up the qualified-name hierarchy to find the closest ancestor
|
|
/// that exists in the node map.
|
|
///
|
|
/// For `"src/main.rs::config::load"` this tries:
|
|
/// 1. `"src/main.rs::config"` (trim last `::` segment)
|
|
/// 2. `"src/main.rs"` (trim again)
|
|
///
|
|
/// Returns the first match found, or `None` if the node is a root.
|
|
fn find_parent_qname(qname: &str, node_map: &HashMap<String, NodeIndex>) -> Option<String> {
|
|
let mut current = qname.to_string();
|
|
loop {
|
|
// Try stripping the last "::" segment
|
|
if let Some(pos) = current.rfind("::") {
|
|
current.truncate(pos);
|
|
if node_map.contains_key(¤t) {
|
|
return Some(current);
|
|
}
|
|
continue;
|
|
}
|
|
// No more "::" — this is a top-level node (file), no parent
|
|
return None;
|
|
}
|
|
}
|
|
|
|
/// The main graph engine that builds and manages code knowledge graphs
|
|
pub struct GraphEngine {
|
|
parser_registry: ParserRegistry,
|
|
max_nodes: u32,
|
|
}
|
|
|
|
/// In-memory representation of a built code graph
|
|
pub struct CodeGraph {
|
|
pub graph: DiGraph<String, CodeEdgeKind>,
|
|
pub node_map: HashMap<String, NodeIndex>,
|
|
pub nodes: Vec<CodeNode>,
|
|
pub edges: Vec<CodeEdge>,
|
|
}
|
|
|
|
impl GraphEngine {
|
|
pub fn new(max_nodes: u32) -> Self {
|
|
Self {
|
|
parser_registry: ParserRegistry::new(),
|
|
max_nodes,
|
|
}
|
|
}
|
|
|
|
/// Build a code graph from a repository directory
|
|
pub fn build_graph(
|
|
&self,
|
|
repo_path: &Path,
|
|
repo_id: &str,
|
|
graph_build_id: &str,
|
|
) -> Result<(CodeGraph, GraphBuildRun), CoreError> {
|
|
let mut build_run = GraphBuildRun::new(repo_id.to_string());
|
|
|
|
info!(repo_id, path = %repo_path.display(), "Starting graph build");
|
|
|
|
// Phase 1: Parse all files
|
|
let parse_output = self.parser_registry.parse_directory(
|
|
repo_path,
|
|
repo_id,
|
|
graph_build_id,
|
|
self.max_nodes,
|
|
)?;
|
|
|
|
// Phase 2: Build petgraph
|
|
let code_graph = self.build_petgraph(parse_output)?;
|
|
|
|
// Phase 3: Run community detection
|
|
let community_count = detect_communities(&code_graph);
|
|
|
|
// Collect language stats
|
|
let mut languages: Vec<String> = code_graph
|
|
.nodes
|
|
.iter()
|
|
.map(|n| n.language.clone())
|
|
.collect::<std::collections::HashSet<_>>()
|
|
.into_iter()
|
|
.collect();
|
|
languages.sort();
|
|
|
|
build_run.node_count = code_graph.nodes.len() as u32;
|
|
build_run.edge_count = code_graph.edges.len() as u32;
|
|
build_run.community_count = community_count;
|
|
build_run.languages_parsed = languages;
|
|
build_run.status = GraphBuildStatus::Completed;
|
|
build_run.completed_at = Some(Utc::now());
|
|
|
|
info!(
|
|
nodes = build_run.node_count,
|
|
edges = build_run.edge_count,
|
|
communities = build_run.community_count,
|
|
"Graph build complete"
|
|
);
|
|
|
|
Ok((code_graph, build_run))
|
|
}
|
|
|
|
/// Build petgraph from parsed output, resolving edges to node indices.
|
|
///
|
|
/// After resolving the explicit edges from parsers, we synthesise
|
|
/// `Contains` edges so that every node is reachable from its parent
|
|
/// file or module. This eliminates disconnected "islands" that
|
|
/// otherwise appear when files share no direct call/import edges.
|
|
fn build_petgraph(&self, parse_output: ParseOutput) -> Result<CodeGraph, CoreError> {
|
|
let mut graph = DiGraph::new();
|
|
let mut node_map: HashMap<String, NodeIndex> = HashMap::new();
|
|
let mut nodes = parse_output.nodes;
|
|
|
|
// Add all nodes to the graph
|
|
for node in &mut nodes {
|
|
let idx = graph.add_node(node.qualified_name.clone());
|
|
node.graph_index = Some(idx.index() as u32);
|
|
node_map.insert(node.qualified_name.clone(), idx);
|
|
}
|
|
|
|
// Resolve and add explicit edges from parsers
|
|
let mut resolved_edges = Vec::new();
|
|
for mut edge in parse_output.edges {
|
|
let source_idx = node_map.get(&edge.source);
|
|
let resolved = self.resolve_edge_target(&edge.target, &node_map);
|
|
|
|
if let (Some(&src), Some(tgt)) = (source_idx, resolved) {
|
|
let resolved_name = node_map
|
|
.iter()
|
|
.find(|(_, &idx)| idx == tgt)
|
|
.map(|(name, _)| name.clone());
|
|
if let Some(name) = resolved_name {
|
|
edge.target = name;
|
|
}
|
|
graph.add_edge(src, tgt, edge.kind.clone());
|
|
resolved_edges.push(edge);
|
|
}
|
|
}
|
|
|
|
// Synthesise Contains edges: connect each node to its closest
|
|
// parent in the qualified-name hierarchy.
|
|
//
|
|
// For "src/main.rs::config::load", the parent chain is:
|
|
// "src/main.rs::config" → "src/main.rs"
|
|
//
|
|
// We walk up the qualified name (splitting on "::") and link to
|
|
// the first ancestor that exists in the node map.
|
|
let repo_id = nodes.first().map(|n| n.repo_id.as_str()).unwrap_or("");
|
|
let build_id = nodes
|
|
.first()
|
|
.map(|n| n.graph_build_id.as_str())
|
|
.unwrap_or("");
|
|
|
|
let qualified_names: Vec<String> = nodes.iter().map(|n| n.qualified_name.clone()).collect();
|
|
let file_paths: HashMap<String, String> = nodes
|
|
.iter()
|
|
.map(|n| (n.qualified_name.clone(), n.file_path.clone()))
|
|
.collect();
|
|
|
|
for qname in &qualified_names {
|
|
if let Some(parent_qname) = find_parent_qname(qname, &node_map) {
|
|
let child_idx = node_map[qname];
|
|
let parent_idx = node_map[&parent_qname];
|
|
|
|
// Avoid duplicate edges
|
|
if !graph.contains_edge(parent_idx, child_idx) {
|
|
graph.add_edge(parent_idx, child_idx, CodeEdgeKind::Contains);
|
|
resolved_edges.push(CodeEdge {
|
|
id: None,
|
|
repo_id: repo_id.to_string(),
|
|
graph_build_id: build_id.to_string(),
|
|
source: parent_qname,
|
|
target: qname.clone(),
|
|
kind: CodeEdgeKind::Contains,
|
|
file_path: file_paths.get(qname).cloned().unwrap_or_default(),
|
|
line_number: None,
|
|
});
|
|
}
|
|
}
|
|
}
|
|
|
|
Ok(CodeGraph {
|
|
graph,
|
|
node_map,
|
|
nodes,
|
|
edges: resolved_edges,
|
|
})
|
|
}
|
|
|
|
/// Try to resolve an edge target to a known node.
|
|
///
|
|
/// Resolution strategies (in order):
|
|
/// 1. Direct qualified-name match
|
|
/// 2. Suffix match: "foo" matches "src/main.rs::mod::foo"
|
|
/// 3. Module-path match: "config::load" matches "src/config.rs::load"
|
|
/// 4. Self-method: "self.method" matches "::method"
|
|
fn resolve_edge_target(
|
|
&self,
|
|
target: &str,
|
|
node_map: &HashMap<String, NodeIndex>,
|
|
) -> Option<NodeIndex> {
|
|
// 1. Direct match
|
|
if let Some(idx) = node_map.get(target) {
|
|
return Some(*idx);
|
|
}
|
|
|
|
// 2. Suffix match: "foo" → "path/file.rs::foo"
|
|
let suffix_pattern = format!("::{target}");
|
|
let dot_pattern = format!(".{target}");
|
|
for (qualified, idx) in node_map {
|
|
if qualified.ends_with(&suffix_pattern) || qualified.ends_with(&dot_pattern) {
|
|
return Some(*idx);
|
|
}
|
|
}
|
|
|
|
// 3. Module-path match: "config::load" → try matching the last N
|
|
// segments of the target against node qualified names.
|
|
// This handles cross-file calls like `crate::config::load` or
|
|
// `super::handlers::process` where the prefix differs.
|
|
if target.contains("::") {
|
|
// Strip common Rust path prefixes
|
|
let stripped = target
|
|
.strip_prefix("crate::")
|
|
.or_else(|| target.strip_prefix("super::"))
|
|
.or_else(|| target.strip_prefix("self::"))
|
|
.unwrap_or(target);
|
|
|
|
let segments: Vec<&str> = stripped.split("::").collect();
|
|
// Try matching progressively shorter suffixes
|
|
for start in 0..segments.len() {
|
|
let suffix = segments[start..].join("::");
|
|
let pattern = format!("::{suffix}");
|
|
for (qualified, idx) in node_map {
|
|
if qualified.ends_with(&pattern) {
|
|
return Some(*idx);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// 4. Self-method: "self.method" → "::method"
|
|
if let Some(method_name) = target.strip_prefix("self.") {
|
|
let pattern = format!("::{method_name}");
|
|
for (qualified, idx) in node_map {
|
|
if qualified.ends_with(&pattern) {
|
|
return Some(*idx);
|
|
}
|
|
}
|
|
}
|
|
|
|
None
|
|
}
|
|
|
|
/// Get the impact analyzer for a built graph
|
|
pub fn impact_analyzer(code_graph: &CodeGraph) -> ImpactAnalyzer<'_> {
|
|
ImpactAnalyzer::new(code_graph)
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;
    use compliance_core::models::graph::{CodeEdgeKind, CodeNode, CodeNodeKind};

    /// Build a function node whose short name is the last `::` segment of
    /// `qualified_name`. Every node uses the same repo, build and file path.
    fn make_node(qualified_name: &str) -> CodeNode {
        CodeNode {
            id: None,
            repo_id: "test".to_string(),
            graph_build_id: "build1".to_string(),
            qualified_name: qualified_name.to_string(),
            name: qualified_name
                .split("::")
                .last()
                .unwrap_or(qualified_name)
                .to_string(),
            kind: CodeNodeKind::Function,
            file_path: "src/main.rs".to_string(),
            start_line: 1,
            end_line: 10,
            language: "rust".to_string(),
            community_id: None,
            is_entry_point: false,
            graph_index: None,
        }
    }

    /// Build a name → index map backed by a throwaway graph, one node per name.
    fn build_test_node_map(names: &[&str]) -> HashMap<String, NodeIndex> {
        let mut graph: DiGraph<String, String> = DiGraph::new();
        let mut map = HashMap::new();
        for name in names {
            let idx = graph.add_node(name.to_string());
            map.insert(name.to_string(), idx);
        }
        map
    }

    #[test]
    fn test_resolve_edge_target_direct_match() {
        let engine = GraphEngine::new(1000);
        let node_map = build_test_node_map(&["src/main.rs::foo", "src/main.rs::bar"]);
        let result = engine.resolve_edge_target("src/main.rs::foo", &node_map);
        assert!(result.is_some());
        assert_eq!(result.unwrap(), node_map["src/main.rs::foo"]);
    }

    #[test]
    fn test_resolve_edge_target_short_name_match() {
        let engine = GraphEngine::new(1000);
        let node_map = build_test_node_map(&["src/main.rs::foo", "src/main.rs::bar"]);
        let result = engine.resolve_edge_target("foo", &node_map);
        assert!(result.is_some());
        assert_eq!(result.unwrap(), node_map["src/main.rs::foo"]);
    }

    #[test]
    fn test_resolve_edge_target_method_match() {
        let engine = GraphEngine::new(1000);
        let node_map = build_test_node_map(&["src/main.rs::MyStruct::do_thing"]);
        let result = engine.resolve_edge_target("do_thing", &node_map);
        assert!(result.is_some());
    }

    #[test]
    fn test_resolve_edge_target_self_method() {
        let engine = GraphEngine::new(1000);
        let node_map = build_test_node_map(&["src/main.rs::MyStruct::process"]);
        let result = engine.resolve_edge_target("self.process", &node_map);
        assert!(result.is_some());
    }

    #[test]
    fn test_resolve_edge_target_no_match() {
        let engine = GraphEngine::new(1000);
        let node_map = build_test_node_map(&["src/main.rs::foo"]);
        let result = engine.resolve_edge_target("nonexistent", &node_map);
        assert!(result.is_none());
    }

    #[test]
    fn test_resolve_edge_target_empty_map() {
        let engine = GraphEngine::new(1000);
        let node_map = HashMap::new();
        let result = engine.resolve_edge_target("anything", &node_map);
        assert!(result.is_none());
    }

    #[test]
    fn test_resolve_edge_target_dot_notation() {
        let engine = GraphEngine::new(1000);
        let node_map = build_test_node_map(&["src/app.js.handler"]);
        let result = engine.resolve_edge_target("handler", &node_map);
        assert!(result.is_some());
    }

    #[test]
    fn test_build_petgraph_empty() {
        let engine = GraphEngine::new(1000);
        let output = ParseOutput::default();
        let code_graph = engine.build_petgraph(output).unwrap();
        assert_eq!(code_graph.nodes.len(), 0);
        assert_eq!(code_graph.edges.len(), 0);
        assert_eq!(code_graph.graph.node_count(), 0);
    }

    #[test]
    fn test_build_petgraph_nodes_get_graph_index() {
        let engine = GraphEngine::new(1000);
        let mut output = ParseOutput::default();
        output.nodes.push(make_node("src/main.rs::foo"));
        output.nodes.push(make_node("src/main.rs::bar"));

        let code_graph = engine.build_petgraph(output).unwrap();
        assert_eq!(code_graph.nodes.len(), 2);
        assert_eq!(code_graph.graph.node_count(), 2);
        // All nodes should have a graph_index assigned
        for node in &code_graph.nodes {
            assert!(node.graph_index.is_some());
        }
    }

    #[test]
    fn test_build_petgraph_resolves_edges() {
        let engine = GraphEngine::new(1000);
        let mut output = ParseOutput::default();
        output.nodes.push(make_node("src/main.rs::foo"));
        output.nodes.push(make_node("src/main.rs::bar"));
        output.edges.push(CodeEdge {
            id: None,
            repo_id: "test".to_string(),
            graph_build_id: "build1".to_string(),
            source: "src/main.rs::foo".to_string(),
            target: "bar".to_string(), // short name, should resolve
            kind: CodeEdgeKind::Calls,
            file_path: "src/main.rs".to_string(),
            line_number: Some(5),
        });

        let code_graph = engine.build_petgraph(output).unwrap();
        assert_eq!(code_graph.edges.len(), 1);
        assert_eq!(code_graph.graph.edge_count(), 1);
        // The resolved edge target should be the full qualified name
        assert_eq!(code_graph.edges[0].target, "src/main.rs::bar");
    }

    #[test]
    fn test_build_petgraph_skips_unresolved_edges() {
        let engine = GraphEngine::new(1000);
        let mut output = ParseOutput::default();
        output.nodes.push(make_node("src/main.rs::foo"));
        output.edges.push(CodeEdge {
            id: None,
            repo_id: "test".to_string(),
            graph_build_id: "build1".to_string(),
            source: "src/main.rs::foo".to_string(),
            target: "external_crate::something".to_string(),
            kind: CodeEdgeKind::Calls,
            file_path: "src/main.rs".to_string(),
            line_number: Some(5),
        });

        let code_graph = engine.build_petgraph(output).unwrap();
        assert_eq!(code_graph.edges.len(), 0);
        assert_eq!(code_graph.graph.edge_count(), 0);
    }

    #[test]
    fn test_code_graph_node_map_consistency() {
        let engine = GraphEngine::new(1000);
        let mut output = ParseOutput::default();
        output.nodes.push(make_node("a::b"));
        output.nodes.push(make_node("a::c"));
        output.nodes.push(make_node("a::d"));

        let code_graph = engine.build_petgraph(output).unwrap();
        assert_eq!(code_graph.node_map.len(), 3);
        assert!(code_graph.node_map.contains_key("a::b"));
        assert!(code_graph.node_map.contains_key("a::c"));
        assert!(code_graph.node_map.contains_key("a::d"));
    }

    #[test]
    fn test_contains_edges_synthesised() {
        let engine = GraphEngine::new(1000);
        let mut output = ParseOutput::default();
        // File → Module → Function hierarchy
        output.nodes.push(make_node("src/main.rs"));
        output.nodes.push(make_node("src/main.rs::config"));
        output.nodes.push(make_node("src/main.rs::config::load"));

        let code_graph = engine.build_petgraph(output).unwrap();

        // Should have 2 Contains edges:
        //   src/main.rs → src/main.rs::config
        //   src/main.rs::config → src/main.rs::config::load
        let contains_edges: Vec<_> = code_graph
            .edges
            .iter()
            .filter(|e| matches!(e.kind, CodeEdgeKind::Contains))
            .collect();
        assert_eq!(contains_edges.len(), 2, "expected 2 Contains edges");

        let sources: Vec<&str> = contains_edges.iter().map(|e| e.source.as_str()).collect();
        assert!(sources.contains(&"src/main.rs"));
        assert!(sources.contains(&"src/main.rs::config"));
    }

    #[test]
    fn test_contains_edges_no_duplicates_with_existing_edges() {
        let engine = GraphEngine::new(1000);
        let mut output = ParseOutput::default();
        output.nodes.push(make_node("src/main.rs"));
        output.nodes.push(make_node("src/main.rs::foo"));

        // Explicit Calls edge (foo calls itself? just for testing).
        // This self-loop does not overlap the synthesised parent → child
        // edge, so both end up in the graph.
        output.edges.push(CodeEdge {
            id: None,
            repo_id: "test".to_string(),
            graph_build_id: "build1".to_string(),
            source: "src/main.rs::foo".to_string(),
            target: "src/main.rs::foo".to_string(),
            kind: CodeEdgeKind::Calls,
            file_path: "src/main.rs".to_string(),
            line_number: Some(1),
        });

        let code_graph = engine.build_petgraph(output).unwrap();

        // 1 Calls + 1 Contains = 2 edges total
        assert_eq!(code_graph.edges.len(), 2);
    }

    #[test]
    fn test_contains_edge_skipped_when_explicit_edge_exists() {
        let engine = GraphEngine::new(1000);
        let mut output = ParseOutput::default();
        output.nodes.push(make_node("src/main.rs"));
        output.nodes.push(make_node("src/main.rs::foo"));

        // A parser already supplied the parent → child edge, so the
        // synthesis pass must not add a second one between the same nodes.
        output.edges.push(CodeEdge {
            id: None,
            repo_id: "test".to_string(),
            graph_build_id: "build1".to_string(),
            source: "src/main.rs".to_string(),
            target: "src/main.rs::foo".to_string(),
            kind: CodeEdgeKind::Contains,
            file_path: "src/main.rs".to_string(),
            line_number: None,
        });

        let code_graph = engine.build_petgraph(output).unwrap();

        assert_eq!(code_graph.edges.len(), 1);
        assert_eq!(code_graph.graph.edge_count(), 1);
    }

    #[test]
    fn test_cross_file_resolution_with_module_path() {
        let engine = GraphEngine::new(1000);
        let node_map = build_test_node_map(&["src/config.rs::load_config", "src/main.rs::main"]);
        // "crate::config::load_config" should resolve to "src/config.rs::load_config"
        let result = engine.resolve_edge_target("crate::config::load_config", &node_map);
        assert!(result.is_some(), "cross-file crate:: path should resolve");
    }

    #[test]
    fn test_find_parent_qname() {
        let node_map = build_test_node_map(&[
            "src/main.rs",
            "src/main.rs::config",
            "src/main.rs::config::load",
        ]);

        assert_eq!(
            find_parent_qname("src/main.rs::config::load", &node_map),
            Some("src/main.rs::config".to_string())
        );
        assert_eq!(
            find_parent_qname("src/main.rs::config", &node_map),
            Some("src/main.rs".to_string())
        );
        assert_eq!(find_parent_qname("src/main.rs", &node_map), None);
    }
}
|