Files
compliance-scanner-agent/compliance-graph/src/graph/engine.rs
T
sharang 4388e98b5b
CI / Check (push) Has been skipped
CI / Detect Changes (push) Successful in 2s
CI / Deploy Agent (push) Successful in 2s
CI / Deploy Dashboard (push) Successful in 2s
CI / Deploy Docs (push) Has been skipped
CI / Deploy MCP (push) Has been skipped
feat: add E2E test suite with nightly CI, fix dashboard Dockerfile (#52)
2026-03-30 10:04:07 +00:00

533 lines
19 KiB
Rust

use std::collections::HashMap;
use std::path::Path;
use chrono::Utc;
use compliance_core::error::CoreError;
use compliance_core::models::graph::{
CodeEdge, CodeEdgeKind, CodeNode, GraphBuildRun, GraphBuildStatus,
};
use compliance_core::traits::graph_builder::ParseOutput;
use petgraph::graph::{DiGraph, NodeIndex};
use tracing::info;
use crate::parsers::registry::ParserRegistry;
use super::community::detect_communities;
use super::impact::ImpactAnalyzer;
/// Locate the nearest existing ancestor of `qname` in `node_map`.
///
/// The qualified name is shortened from the right, one `::`-separated
/// segment at a time, and each shortened prefix is looked up in the map.
/// For `"src/main.rs::config::load"` the candidates are, in order:
/// 1. `"src/main.rs::config"`
/// 2. `"src/main.rs"`
///
/// Returns the first prefix that is a key of `node_map`, or `None` when no
/// prefix matches (which includes names that contain no `::` at all).
fn find_parent_qname(qname: &str, node_map: &HashMap<String, NodeIndex>) -> Option<String> {
    let mut prefix = qname;
    while let Some(cut) = prefix.rfind("::") {
        // Drop the trailing "::segment" and test what is left.
        prefix = &prefix[..cut];
        if node_map.contains_key(prefix) {
            return Some(prefix.to_string());
        }
    }
    // Ran out of separators without meeting a known ancestor.
    None
}
/// The main graph engine that builds and manages code knowledge graphs.
///
/// Parses a repository through its `ParserRegistry` and assembles the
/// parsed nodes and edges into a `CodeGraph`.
pub struct GraphEngine {
/// Registry used to parse a repository directory into nodes and edges.
parser_registry: ParserRegistry,
/// Node limit handed to the parser registry on every build
/// (presumably an upper bound on parsed nodes; confirm in `ParserRegistry`).
max_nodes: u32,
}
/// In-memory representation of a built code graph.
///
/// Bundles the petgraph structure with the node and edge records it was
/// built from.
pub struct CodeGraph {
/// Directed graph whose node weights are qualified names and whose edge
/// weights are edge kinds.
pub graph: DiGraph<String, CodeEdgeKind>,
/// Lookup from a node's qualified name to its index in `graph`.
pub node_map: HashMap<String, NodeIndex>,
/// Node records; `graph_index` is filled in while the graph is built.
pub nodes: Vec<CodeNode>,
/// Parser edges that resolved to known nodes, followed by the synthesised
/// `Contains` edges.
pub edges: Vec<CodeEdge>,
}
impl GraphEngine {
pub fn new(max_nodes: u32) -> Self {
Self {
parser_registry: ParserRegistry::new(),
max_nodes,
}
}
/// Build a code graph from a repository directory
pub fn build_graph(
&self,
repo_path: &Path,
repo_id: &str,
graph_build_id: &str,
) -> Result<(CodeGraph, GraphBuildRun), CoreError> {
let mut build_run = GraphBuildRun::new(repo_id.to_string());
info!(repo_id, path = %repo_path.display(), "Starting graph build");
// Phase 1: Parse all files
let parse_output = self.parser_registry.parse_directory(
repo_path,
repo_id,
graph_build_id,
self.max_nodes,
)?;
// Phase 2: Build petgraph
let code_graph = self.build_petgraph(parse_output)?;
// Phase 3: Run community detection
let community_count = detect_communities(&code_graph);
// Collect language stats
let mut languages: Vec<String> = code_graph
.nodes
.iter()
.map(|n| n.language.clone())
.collect::<std::collections::HashSet<_>>()
.into_iter()
.collect();
languages.sort();
build_run.node_count = code_graph.nodes.len() as u32;
build_run.edge_count = code_graph.edges.len() as u32;
build_run.community_count = community_count;
build_run.languages_parsed = languages;
build_run.status = GraphBuildStatus::Completed;
build_run.completed_at = Some(Utc::now());
info!(
nodes = build_run.node_count,
edges = build_run.edge_count,
communities = build_run.community_count,
"Graph build complete"
);
Ok((code_graph, build_run))
}
/// Build petgraph from parsed output, resolving edges to node indices.
///
/// After resolving the explicit edges from parsers, we synthesise
/// `Contains` edges so that every node is reachable from its parent
/// file or module. This eliminates disconnected "islands" that
/// otherwise appear when files share no direct call/import edges.
fn build_petgraph(&self, parse_output: ParseOutput) -> Result<CodeGraph, CoreError> {
let mut graph = DiGraph::new();
let mut node_map: HashMap<String, NodeIndex> = HashMap::new();
let mut nodes = parse_output.nodes;
// Add all nodes to the graph
for node in &mut nodes {
let idx = graph.add_node(node.qualified_name.clone());
node.graph_index = Some(idx.index() as u32);
node_map.insert(node.qualified_name.clone(), idx);
}
// Resolve and add explicit edges from parsers
let mut resolved_edges = Vec::new();
for mut edge in parse_output.edges {
let source_idx = node_map.get(&edge.source);
let resolved = self.resolve_edge_target(&edge.target, &node_map);
if let (Some(&src), Some(tgt)) = (source_idx, resolved) {
let resolved_name = node_map
.iter()
.find(|(_, &idx)| idx == tgt)
.map(|(name, _)| name.clone());
if let Some(name) = resolved_name {
edge.target = name;
}
graph.add_edge(src, tgt, edge.kind.clone());
resolved_edges.push(edge);
}
}
// Synthesise Contains edges: connect each node to its closest
// parent in the qualified-name hierarchy.
//
// For "src/main.rs::config::load", the parent chain is:
// "src/main.rs::config" → "src/main.rs"
//
// We walk up the qualified name (splitting on "::") and link to
// the first ancestor that exists in the node map.
let repo_id = nodes.first().map(|n| n.repo_id.as_str()).unwrap_or("");
let build_id = nodes
.first()
.map(|n| n.graph_build_id.as_str())
.unwrap_or("");
let qualified_names: Vec<String> = nodes.iter().map(|n| n.qualified_name.clone()).collect();
let file_paths: HashMap<String, String> = nodes
.iter()
.map(|n| (n.qualified_name.clone(), n.file_path.clone()))
.collect();
for qname in &qualified_names {
if let Some(parent_qname) = find_parent_qname(qname, &node_map) {
let child_idx = node_map[qname];
let parent_idx = node_map[&parent_qname];
// Avoid duplicate edges
if !graph.contains_edge(parent_idx, child_idx) {
graph.add_edge(parent_idx, child_idx, CodeEdgeKind::Contains);
resolved_edges.push(CodeEdge {
id: None,
repo_id: repo_id.to_string(),
graph_build_id: build_id.to_string(),
source: parent_qname,
target: qname.clone(),
kind: CodeEdgeKind::Contains,
file_path: file_paths.get(qname).cloned().unwrap_or_default(),
line_number: None,
});
}
}
}
Ok(CodeGraph {
graph,
node_map,
nodes,
edges: resolved_edges,
})
}
/// Try to resolve an edge target to a known node.
///
/// Resolution strategies (in order):
/// 1. Direct qualified-name match
/// 2. Suffix match: "foo" matches "src/main.rs::mod::foo"
/// 3. Module-path match: "config::load" matches "src/config.rs::load"
/// 4. Self-method: "self.method" matches "::method"
fn resolve_edge_target(
&self,
target: &str,
node_map: &HashMap<String, NodeIndex>,
) -> Option<NodeIndex> {
// 1. Direct match
if let Some(idx) = node_map.get(target) {
return Some(*idx);
}
// 2. Suffix match: "foo" → "path/file.rs::foo"
let suffix_pattern = format!("::{target}");
let dot_pattern = format!(".{target}");
for (qualified, idx) in node_map {
if qualified.ends_with(&suffix_pattern) || qualified.ends_with(&dot_pattern) {
return Some(*idx);
}
}
// 3. Module-path match: "config::load" → try matching the last N
// segments of the target against node qualified names.
// This handles cross-file calls like `crate::config::load` or
// `super::handlers::process` where the prefix differs.
if target.contains("::") {
// Strip common Rust path prefixes
let stripped = target
.strip_prefix("crate::")
.or_else(|| target.strip_prefix("super::"))
.or_else(|| target.strip_prefix("self::"))
.unwrap_or(target);
let segments: Vec<&str> = stripped.split("::").collect();
// Try matching progressively shorter suffixes
for start in 0..segments.len() {
let suffix = segments[start..].join("::");
let pattern = format!("::{suffix}");
for (qualified, idx) in node_map {
if qualified.ends_with(&pattern) {
return Some(*idx);
}
}
}
}
// 4. Self-method: "self.method" → "::method"
if let Some(method_name) = target.strip_prefix("self.") {
let pattern = format!("::{method_name}");
for (qualified, idx) in node_map {
if qualified.ends_with(&pattern) {
return Some(*idx);
}
}
}
None
}
/// Get the impact analyzer for a built graph
pub fn impact_analyzer(code_graph: &CodeGraph) -> ImpactAnalyzer<'_> {
ImpactAnalyzer::new(code_graph)
}
}
#[cfg(test)]
mod tests {
    use super::*;
    use compliance_core::models::graph::{CodeEdgeKind, CodeNode, CodeNodeKind};

    /// Build a minimal function node with the given qualified name.
    /// The short `name` is the last `::` segment of the qualified name.
    fn make_node(qualified_name: &str) -> CodeNode {
        CodeNode {
            id: None,
            repo_id: "test".to_string(),
            graph_build_id: "build1".to_string(),
            qualified_name: qualified_name.to_string(),
            name: qualified_name
                .split("::")
                .last()
                .unwrap_or(qualified_name)
                .to_string(),
            kind: CodeNodeKind::Function,
            file_path: "src/main.rs".to_string(),
            start_line: 1,
            end_line: 10,
            language: "rust".to_string(),
            community_id: None,
            is_entry_point: false,
            graph_index: None,
        }
    }

    /// Build a `Calls` edge between two names, as a parser would emit it.
    fn make_calls_edge(source: &str, target: &str) -> CodeEdge {
        CodeEdge {
            id: None,
            repo_id: "test".to_string(),
            graph_build_id: "build1".to_string(),
            source: source.to_string(),
            target: target.to_string(),
            kind: CodeEdgeKind::Calls,
            file_path: "src/main.rs".to_string(),
            line_number: Some(1),
        }
    }

    /// Build a name → index map. The throwaway graph exists only to mint
    /// distinct `NodeIndex` values.
    fn build_test_node_map(names: &[&str]) -> HashMap<String, NodeIndex> {
        let mut graph: DiGraph<String, String> = DiGraph::new();
        let mut map = HashMap::new();
        for name in names {
            let idx = graph.add_node(name.to_string());
            map.insert(name.to_string(), idx);
        }
        map
    }

    #[test]
    fn test_resolve_edge_target_direct_match() {
        let engine = GraphEngine::new(1000);
        let node_map = build_test_node_map(&["src/main.rs::foo", "src/main.rs::bar"]);
        let result = engine.resolve_edge_target("src/main.rs::foo", &node_map);
        assert_eq!(result, Some(node_map["src/main.rs::foo"]));
    }

    #[test]
    fn test_resolve_edge_target_short_name_match() {
        let engine = GraphEngine::new(1000);
        let node_map = build_test_node_map(&["src/main.rs::foo", "src/main.rs::bar"]);
        let result = engine.resolve_edge_target("foo", &node_map);
        assert_eq!(result, Some(node_map["src/main.rs::foo"]));
    }

    #[test]
    fn test_resolve_edge_target_method_match() {
        let engine = GraphEngine::new(1000);
        let node_map = build_test_node_map(&["src/main.rs::MyStruct::do_thing"]);
        let result = engine.resolve_edge_target("do_thing", &node_map);
        assert_eq!(result, Some(node_map["src/main.rs::MyStruct::do_thing"]));
    }

    #[test]
    fn test_resolve_edge_target_self_method() {
        let engine = GraphEngine::new(1000);
        let node_map = build_test_node_map(&["src/main.rs::MyStruct::process"]);
        let result = engine.resolve_edge_target("self.process", &node_map);
        assert_eq!(result, Some(node_map["src/main.rs::MyStruct::process"]));
    }

    #[test]
    fn test_resolve_edge_target_no_match() {
        let engine = GraphEngine::new(1000);
        let node_map = build_test_node_map(&["src/main.rs::foo"]);
        let result = engine.resolve_edge_target("nonexistent", &node_map);
        assert!(result.is_none());
    }

    #[test]
    fn test_resolve_edge_target_empty_map() {
        let engine = GraphEngine::new(1000);
        let node_map = HashMap::new();
        let result = engine.resolve_edge_target("anything", &node_map);
        assert!(result.is_none());
    }

    #[test]
    fn test_resolve_edge_target_dot_notation() {
        let engine = GraphEngine::new(1000);
        let node_map = build_test_node_map(&["src/app.js.handler"]);
        let result = engine.resolve_edge_target("handler", &node_map);
        assert_eq!(result, Some(node_map["src/app.js.handler"]));
    }

    #[test]
    fn test_build_petgraph_empty() {
        let engine = GraphEngine::new(1000);
        let output = ParseOutput::default();
        let code_graph = engine.build_petgraph(output).unwrap();
        assert_eq!(code_graph.nodes.len(), 0);
        assert_eq!(code_graph.edges.len(), 0);
        assert_eq!(code_graph.graph.node_count(), 0);
    }

    #[test]
    fn test_build_petgraph_nodes_get_graph_index() {
        let engine = GraphEngine::new(1000);
        let mut output = ParseOutput::default();
        output.nodes.push(make_node("src/main.rs::foo"));
        output.nodes.push(make_node("src/main.rs::bar"));
        let code_graph = engine.build_petgraph(output).unwrap();
        assert_eq!(code_graph.nodes.len(), 2);
        assert_eq!(code_graph.graph.node_count(), 2);
        // Every node's graph_index must agree with its entry in node_map.
        for node in &code_graph.nodes {
            let expected = code_graph.node_map[&node.qualified_name].index() as u32;
            assert_eq!(node.graph_index, Some(expected));
        }
    }

    #[test]
    fn test_build_petgraph_resolves_edges() {
        let engine = GraphEngine::new(1000);
        let mut output = ParseOutput::default();
        output.nodes.push(make_node("src/main.rs::foo"));
        output.nodes.push(make_node("src/main.rs::bar"));
        // Short target name, should resolve to the full qualified name.
        output
            .edges
            .push(make_calls_edge("src/main.rs::foo", "bar"));
        let code_graph = engine.build_petgraph(output).unwrap();
        assert_eq!(code_graph.edges.len(), 1);
        assert_eq!(code_graph.graph.edge_count(), 1);
        // The resolved edge target should be the full qualified name
        assert_eq!(code_graph.edges[0].target, "src/main.rs::bar");
    }

    #[test]
    fn test_build_petgraph_skips_unresolved_edges() {
        let engine = GraphEngine::new(1000);
        let mut output = ParseOutput::default();
        output.nodes.push(make_node("src/main.rs::foo"));
        output.edges.push(make_calls_edge(
            "src/main.rs::foo",
            "external_crate::something",
        ));
        let code_graph = engine.build_petgraph(output).unwrap();
        assert_eq!(code_graph.edges.len(), 0);
        assert_eq!(code_graph.graph.edge_count(), 0);
    }

    #[test]
    fn test_code_graph_node_map_consistency() {
        let engine = GraphEngine::new(1000);
        let mut output = ParseOutput::default();
        output.nodes.push(make_node("a::b"));
        output.nodes.push(make_node("a::c"));
        output.nodes.push(make_node("a::d"));
        let code_graph = engine.build_petgraph(output).unwrap();
        assert_eq!(code_graph.node_map.len(), 3);
        assert!(code_graph.node_map.contains_key("a::b"));
        assert!(code_graph.node_map.contains_key("a::c"));
        assert!(code_graph.node_map.contains_key("a::d"));
    }

    #[test]
    fn test_contains_edges_synthesised() {
        let engine = GraphEngine::new(1000);
        let mut output = ParseOutput::default();
        // File → Module → Function hierarchy
        output.nodes.push(make_node("src/main.rs"));
        output.nodes.push(make_node("src/main.rs::config"));
        output.nodes.push(make_node("src/main.rs::config::load"));
        let code_graph = engine.build_petgraph(output).unwrap();
        // Should have 2 Contains edges:
        //   src/main.rs → src/main.rs::config
        //   src/main.rs::config → src/main.rs::config::load
        let contains_edges: Vec<_> = code_graph
            .edges
            .iter()
            .filter(|e| matches!(e.kind, CodeEdgeKind::Contains))
            .collect();
        assert_eq!(contains_edges.len(), 2, "expected 2 Contains edges");
        assert_eq!(code_graph.graph.edge_count(), 2);
        let sources: Vec<&str> = contains_edges.iter().map(|e| e.source.as_str()).collect();
        assert!(sources.contains(&"src/main.rs"));
        assert!(sources.contains(&"src/main.rs::config"));
    }

    #[test]
    fn test_contains_edges_no_duplicates_with_existing_edges() {
        let engine = GraphEngine::new(1000);
        let mut output = ParseOutput::default();
        output.nodes.push(make_node("src/main.rs"));
        output.nodes.push(make_node("src/main.rs::foo"));
        // The explicit edge is a self-loop on foo, so it does not collide
        // with the synthesised file → foo Contains edge; both are kept.
        output
            .edges
            .push(make_calls_edge("src/main.rs::foo", "src/main.rs::foo"));
        let code_graph = engine.build_petgraph(output).unwrap();
        // 1 Calls + 1 Contains = 2 edges total
        assert_eq!(code_graph.edges.len(), 2);
    }

    #[test]
    fn test_contains_edge_skipped_when_parent_child_edge_exists() {
        let engine = GraphEngine::new(1000);
        let mut output = ParseOutput::default();
        output.nodes.push(make_node("src/main.rs"));
        output.nodes.push(make_node("src/main.rs::foo"));
        // An explicit parent → child edge already connects the pair, so no
        // Contains edge should be synthesised on top of it.
        output
            .edges
            .push(make_calls_edge("src/main.rs", "src/main.rs::foo"));
        let code_graph = engine.build_petgraph(output).unwrap();
        assert_eq!(code_graph.edges.len(), 1);
        assert_eq!(code_graph.graph.edge_count(), 1);
        assert!(matches!(code_graph.edges[0].kind, CodeEdgeKind::Calls));
    }

    #[test]
    fn test_cross_file_resolution_with_module_path() {
        let engine = GraphEngine::new(1000);
        let node_map = build_test_node_map(&["src/config.rs::load_config", "src/main.rs::main"]);
        // "crate::config::load_config" should resolve to "src/config.rs::load_config"
        let result = engine.resolve_edge_target("crate::config::load_config", &node_map);
        assert_eq!(
            result,
            Some(node_map["src/config.rs::load_config"]),
            "cross-file crate:: path should resolve"
        );
    }

    #[test]
    fn test_find_parent_qname() {
        let node_map = build_test_node_map(&[
            "src/main.rs",
            "src/main.rs::config",
            "src/main.rs::config::load",
        ]);
        assert_eq!(
            find_parent_qname("src/main.rs::config::load", &node_map),
            Some("src/main.rs::config".to_string())
        );
        assert_eq!(
            find_parent_qname("src/main.rs::config", &node_map),
            Some("src/main.rs".to_string())
        );
        assert_eq!(find_parent_qname("src/main.rs", &node_map), None);
    }

    #[test]
    fn test_find_parent_qname_skips_missing_intermediate() {
        // The "config" module node is absent, so the file is the closest
        // ancestor that exists.
        let node_map = build_test_node_map(&["src/main.rs", "src/main.rs::config::load"]);
        assert_eq!(
            find_parent_qname("src/main.rs::config::load", &node_map),
            Some("src/main.rs".to_string())
        );
    }
}