Files
compliance-scanner-agent/compliance-graph/src/search/index.rs
Sharang Parnerkar 3bb690e5bb
All checks were successful
CI / Format (push) Successful in 4s
CI / Clippy (push) Successful in 4m19s
CI / Security Audit (push) Successful in 1m44s
CI / Detect Changes (push) Successful in 5s
CI / Tests (push) Successful in 5m15s
CI / Deploy Agent (push) Successful in 2s
CI / Deploy Dashboard (push) Successful in 2s
CI / Deploy Docs (push) Has been skipped
CI / Deploy MCP (push) Successful in 2s
refactor: modularize codebase and add 404 unit tests (#13)
2026-03-13 08:03:45 +00:00

314 lines
9.5 KiB
Rust

use compliance_core::error::CoreError;
use compliance_core::models::graph::CodeNode;
use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::{Schema, Value, STORED, TEXT};
use tantivy::{doc, Index, IndexWriter, ReloadPolicy};
use tracing::info;
/// BM25 text search index over code symbols.
///
/// Bundles a tantivy [`Index`] with the field handles of its schema so that
/// documents can be written and read back without looking fields up by name.
pub struct SymbolIndex {
/// The tantivy index that documents are written to and searched in.
index: Index,
// `schema` is not read by the methods visible in this file, hence the lint
// allowance; it is the schema the index was created from.
#[allow(dead_code)]
schema: Schema,
/// Handle of the `qualified_name` text field.
qualified_name_field: tantivy::schema::Field,
/// Handle of the `name` text field.
name_field: tantivy::schema::Field,
/// Handle of the `kind` text field.
kind_field: tantivy::schema::Field,
/// Handle of the `file_path` text field.
file_path_field: tantivy::schema::Field,
/// Handle of the `language` text field.
language_field: tantivy::schema::Field,
}
/// A single hit returned by [`SymbolIndex::search`].
///
/// Every string field is copied from the stored document; a field that is
/// absent or not a string in the document is reported as an empty string.
#[derive(Debug, Clone, serde::Serialize)]
pub struct SearchResult {
/// Stored `qualified_name` value of the matching document.
pub qualified_name: String,
/// Stored `name` value of the matching document.
pub name: String,
/// Stored `kind` value of the matching document.
pub kind: String,
/// Stored `file_path` value of the matching document.
pub file_path: String,
/// Stored `language` value of the matching document.
pub language: String,
/// Score the search collector assigned to this hit.
pub score: f32,
}
impl SymbolIndex {
/// Create a new in-memory symbol index
pub fn new() -> Result<Self, CoreError> {
let mut schema_builder = Schema::builder();
let qualified_name_field = schema_builder.add_text_field("qualified_name", TEXT | STORED);
let name_field = schema_builder.add_text_field("name", TEXT | STORED);
let kind_field = schema_builder.add_text_field("kind", TEXT | STORED);
let file_path_field = schema_builder.add_text_field("file_path", TEXT | STORED);
let language_field = schema_builder.add_text_field("language", TEXT | STORED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema.clone());
Ok(Self {
index,
schema,
qualified_name_field,
name_field,
kind_field,
file_path_field,
language_field,
})
}
/// Index a set of code nodes
pub fn index_nodes(&self, nodes: &[CodeNode]) -> Result<(), CoreError> {
let mut writer: IndexWriter = self
.index
.writer(50_000_000)
.map_err(|e| CoreError::Graph(format!("Failed to create index writer: {e}")))?;
for node in nodes {
writer
.add_document(doc!(
self.qualified_name_field => node.qualified_name.as_str(),
self.name_field => node.name.as_str(),
self.kind_field => node.kind.to_string(),
self.file_path_field => node.file_path.as_str(),
self.language_field => node.language.as_str(),
))
.map_err(|e| CoreError::Graph(format!("Failed to add document: {e}")))?;
}
writer
.commit()
.map_err(|e| CoreError::Graph(format!("Failed to commit index: {e}")))?;
info!(nodes = nodes.len(), "Symbol index built");
Ok(())
}
/// Search for symbols matching a query
pub fn search(&self, query_str: &str, limit: usize) -> Result<Vec<SearchResult>, CoreError> {
let reader = self
.index
.reader_builder()
.reload_policy(ReloadPolicy::Manual)
.try_into()
.map_err(|e| CoreError::Graph(format!("Failed to create reader: {e}")))?;
let searcher = reader.searcher();
let query_parser = QueryParser::for_index(
&self.index,
vec![self.name_field, self.qualified_name_field],
);
let query = query_parser
.parse_query(query_str)
.map_err(|e| CoreError::Graph(format!("Failed to parse query: {e}")))?;
let top_docs = searcher
.search(&query, &TopDocs::with_limit(limit))
.map_err(|e| CoreError::Graph(format!("Search failed: {e}")))?;
let mut results = Vec::new();
for (score, doc_address) in top_docs {
let doc: tantivy::TantivyDocument = searcher
.doc(doc_address)
.map_err(|e| CoreError::Graph(format!("Failed to retrieve doc: {e}")))?;
let get_field = |field: tantivy::schema::Field| -> String {
doc.get_first(field)
.and_then(|v| v.as_str())
.unwrap_or("")
.to_string()
};
results.push(SearchResult {
qualified_name: get_field(self.qualified_name_field),
name: get_field(self.name_field),
kind: get_field(self.kind_field),
file_path: get_field(self.file_path_field),
language: get_field(self.language_field),
score,
});
}
Ok(results)
}
}
#[cfg(test)]
mod tests {
    use super::*;
    use compliance_core::models::graph::CodeNodeKind;

    /// Build a `CodeNode` fixture with fixed repo/build ids and line range.
    fn make_node(
        qualified_name: &str,
        name: &str,
        kind: CodeNodeKind,
        file_path: &str,
        language: &str,
    ) -> CodeNode {
        CodeNode {
            id: None,
            repo_id: String::from("test"),
            graph_build_id: String::from("build1"),
            qualified_name: qualified_name.to_owned(),
            name: name.to_owned(),
            kind,
            file_path: file_path.to_owned(),
            start_line: 1,
            end_line: 10,
            language: language.to_owned(),
            community_id: None,
            is_entry_point: false,
            graph_index: None,
        }
    }

    /// Create a fresh index and load `nodes` into it.
    fn index_with(nodes: &[CodeNode]) -> SymbolIndex {
        let index = SymbolIndex::new().unwrap();
        index.index_nodes(nodes).unwrap();
        index
    }

    /// Constructing an index succeeds.
    #[test]
    fn test_new_creates_index() {
        assert!(SymbolIndex::new().is_ok());
    }

    /// Indexing an empty slice is accepted.
    #[test]
    fn test_index_empty_nodes() {
        let index = SymbolIndex::new().unwrap();
        assert!(index.index_nodes(&[]).is_ok());
    }

    /// A single indexed node is found by its name.
    #[test]
    fn test_index_and_search_single_node() {
        let index = index_with(&[make_node(
            "src/main.rs::main",
            "main",
            CodeNodeKind::Function,
            "src/main.rs",
            "rust",
        )]);

        let hits = index.search("main", 10).unwrap();

        assert!(!hits.is_empty());
        let top = &hits[0];
        assert_eq!(top.name, "main");
        assert_eq!(top.qualified_name, "src/main.rs::main");
    }

    /// A term that appears nowhere yields no hits.
    #[test]
    fn test_search_no_results() {
        let index = index_with(&[make_node(
            "src/main.rs::foo",
            "foo",
            CodeNodeKind::Function,
            "src/main.rs",
            "rust",
        )]);

        let hits = index.search("zzzznonexistent", 10).unwrap();

        assert!(hits.is_empty());
    }

    /// A shared name fragment matches every node that contains it.
    #[test]
    fn test_search_multiple_nodes() {
        let fixtures = [
            ("a.rs::handle_request", "handle_request", "a.rs"),
            ("b.rs::handle_response", "handle_response", "b.rs"),
            ("c.rs::process_data", "process_data", "c.rs"),
        ];
        let nodes: Vec<CodeNode> = fixtures
            .iter()
            .map(|&(qualified_name, name, file_path)| {
                make_node(
                    qualified_name,
                    name,
                    CodeNodeKind::Function,
                    file_path,
                    "rust",
                )
            })
            .collect();
        let index = index_with(&nodes);

        let hits = index.search("handle", 10).unwrap();

        assert!(hits.len() >= 2);
    }

    /// The result count never exceeds the requested limit.
    #[test]
    fn test_search_limit() {
        let nodes: Vec<CodeNode> = (0..20)
            .map(|i| {
                make_node(
                    &format!("mod::func_{i}"),
                    &format!("func_{i}"),
                    CodeNodeKind::Function,
                    "mod.rs",
                    "rust",
                )
            })
            .collect();
        let index = index_with(&nodes);

        let hits = index.search("func", 5).unwrap();

        assert!(hits.len() <= 5);
    }

    /// A hit carries a positive score.
    #[test]
    fn test_search_result_has_score() {
        let index = index_with(&[make_node(
            "src/lib.rs::compute",
            "compute",
            CodeNodeKind::Function,
            "src/lib.rs",
            "rust",
        )]);

        let hits = index.search("compute", 10).unwrap();

        assert!(!hits.is_empty());
        assert!(hits[0].score > 0.0);
    }

    /// Stored fields round-trip into the search result.
    #[test]
    fn test_search_result_fields() {
        let index = index_with(&[make_node(
            "src/app.py::MyClass",
            "MyClass",
            CodeNodeKind::Class,
            "src/app.py",
            "python",
        )]);

        let hits = index.search("MyClass", 10).unwrap();

        assert_eq!(hits.len(), 1);
        let only = &hits[0];
        assert_eq!(only.name, "MyClass");
        assert_eq!(only.kind, "class");
        assert_eq!(only.file_path, "src/app.py");
        assert_eq!(only.language, "python");
    }

    /// An empty query must not panic.
    #[test]
    fn test_search_empty_query() {
        let index = index_with(&[make_node(
            "src/lib.rs::foo",
            "foo",
            CodeNodeKind::Function,
            "src/lib.rs",
            "rust",
        )]);

        // Empty query may parse error or return empty - both acceptable;
        // the outcome is deliberately discarded.
        let _ = index.search("", 10);
    }
}