use compliance_core::error::CoreError; use compliance_core::models::graph::CodeNode; use tantivy::collector::TopDocs; use tantivy::query::QueryParser; use tantivy::schema::{Schema, Value, STORED, TEXT}; use tantivy::{doc, Index, IndexWriter, ReloadPolicy}; use tracing::info; /// BM25 text search index over code symbols pub struct SymbolIndex { index: Index, #[allow(dead_code)] schema: Schema, qualified_name_field: tantivy::schema::Field, name_field: tantivy::schema::Field, kind_field: tantivy::schema::Field, file_path_field: tantivy::schema::Field, language_field: tantivy::schema::Field, } #[derive(Debug, Clone, serde::Serialize)] pub struct SearchResult { pub qualified_name: String, pub name: String, pub kind: String, pub file_path: String, pub language: String, pub score: f32, } impl SymbolIndex { /// Create a new in-memory symbol index pub fn new() -> Result { let mut schema_builder = Schema::builder(); let qualified_name_field = schema_builder.add_text_field("qualified_name", TEXT | STORED); let name_field = schema_builder.add_text_field("name", TEXT | STORED); let kind_field = schema_builder.add_text_field("kind", TEXT | STORED); let file_path_field = schema_builder.add_text_field("file_path", TEXT | STORED); let language_field = schema_builder.add_text_field("language", TEXT | STORED); let schema = schema_builder.build(); let index = Index::create_in_ram(schema.clone()); Ok(Self { index, schema, qualified_name_field, name_field, kind_field, file_path_field, language_field, }) } /// Index a set of code nodes pub fn index_nodes(&self, nodes: &[CodeNode]) -> Result<(), CoreError> { let mut writer: IndexWriter = self .index .writer(50_000_000) .map_err(|e| CoreError::Graph(format!("Failed to create index writer: {e}")))?; for node in nodes { writer .add_document(doc!( self.qualified_name_field => node.qualified_name.as_str(), self.name_field => node.name.as_str(), self.kind_field => node.kind.to_string(), self.file_path_field => node.file_path.as_str(), self.language_field => node.language.as_str(), )) .map_err(|e| CoreError::Graph(format!("Failed to add document: {e}")))?; } writer .commit() .map_err(|e| CoreError::Graph(format!("Failed to commit index: {e}")))?; info!(nodes = nodes.len(), "Symbol index built"); Ok(()) } /// Search for symbols matching a query pub fn search(&self, query_str: &str, limit: usize) -> Result, CoreError> { let reader = self .index .reader_builder() .reload_policy(ReloadPolicy::Manual) .try_into() .map_err(|e| CoreError::Graph(format!("Failed to create reader: {e}")))?; let searcher = reader.searcher(); let query_parser = QueryParser::for_index( &self.index, vec![self.name_field, self.qualified_name_field], ); let query = query_parser .parse_query(query_str) .map_err(|e| CoreError::Graph(format!("Failed to parse query: {e}")))?; let top_docs = searcher .search(&query, &TopDocs::with_limit(limit)) .map_err(|e| CoreError::Graph(format!("Search failed: {e}")))?; let mut results = Vec::new(); for (score, doc_address) in top_docs { let doc: tantivy::TantivyDocument = searcher .doc(doc_address) .map_err(|e| CoreError::Graph(format!("Failed to retrieve doc: {e}")))?; let get_field = |field: tantivy::schema::Field| -> String { doc.get_first(field) .and_then(|v| v.as_str()) .unwrap_or("") .to_string() }; results.push(SearchResult { qualified_name: get_field(self.qualified_name_field), name: get_field(self.name_field), kind: get_field(self.kind_field), file_path: get_field(self.file_path_field), language: get_field(self.language_field), score, }); } Ok(results) } } #[cfg(test)] mod tests { use super::*; use compliance_core::models::graph::CodeNodeKind; fn make_node( qualified_name: &str, name: &str, kind: CodeNodeKind, file_path: &str, language: &str, ) -> CodeNode { CodeNode { id: None, repo_id: "test".to_string(), graph_build_id: "build1".to_string(), qualified_name: qualified_name.to_string(), name: name.to_string(), kind, file_path: file_path.to_string(), start_line: 1, end_line: 10, language: language.to_string(), community_id: None, is_entry_point: false, graph_index: None, } } #[test] fn test_new_creates_index() { let index = SymbolIndex::new(); assert!(index.is_ok()); } #[test] fn test_index_empty_nodes() { let index = SymbolIndex::new().unwrap(); let result = index.index_nodes(&[]); assert!(result.is_ok()); } #[test] fn test_index_and_search_single_node() { let index = SymbolIndex::new().unwrap(); let nodes = vec![make_node( "src/main.rs::main", "main", CodeNodeKind::Function, "src/main.rs", "rust", )]; index.index_nodes(&nodes).unwrap(); let results = index.search("main", 10).unwrap(); assert!(!results.is_empty()); assert_eq!(results[0].name, "main"); assert_eq!(results[0].qualified_name, "src/main.rs::main"); } #[test] fn test_search_no_results() { let index = SymbolIndex::new().unwrap(); let nodes = vec![make_node( "src/main.rs::foo", "foo", CodeNodeKind::Function, "src/main.rs", "rust", )]; index.index_nodes(&nodes).unwrap(); let results = index.search("zzzznonexistent", 10).unwrap(); assert!(results.is_empty()); } #[test] fn test_search_multiple_nodes() { let index = SymbolIndex::new().unwrap(); let nodes = vec![ make_node( "a.rs::handle_request", "handle_request", CodeNodeKind::Function, "a.rs", "rust", ), make_node( "b.rs::handle_response", "handle_response", CodeNodeKind::Function, "b.rs", "rust", ), make_node( "c.rs::process_data", "process_data", CodeNodeKind::Function, "c.rs", "rust", ), ]; index.index_nodes(&nodes).unwrap(); let results = index.search("handle", 10).unwrap(); assert!(results.len() >= 2); } #[test] fn test_search_limit() { let index = SymbolIndex::new().unwrap(); let mut nodes = Vec::new(); for i in 0..20 { nodes.push(make_node( &format!("mod::func_{i}"), &format!("func_{i}"), CodeNodeKind::Function, "mod.rs", "rust", )); } index.index_nodes(&nodes).unwrap(); let results = index.search("func", 5).unwrap(); assert!(results.len() <= 5); } #[test] fn test_search_result_has_score() { let index = SymbolIndex::new().unwrap(); let nodes = vec![make_node( "src/lib.rs::compute", "compute", CodeNodeKind::Function, "src/lib.rs", "rust", )]; index.index_nodes(&nodes).unwrap(); let results = index.search("compute", 10).unwrap(); assert!(!results.is_empty()); assert!(results[0].score > 0.0); } #[test] fn test_search_result_fields() { let index = SymbolIndex::new().unwrap(); let nodes = vec![make_node( "src/app.py::MyClass", "MyClass", CodeNodeKind::Class, "src/app.py", "python", )]; index.index_nodes(&nodes).unwrap(); let results = index.search("MyClass", 10).unwrap(); assert_eq!(results.len(), 1); assert_eq!(results[0].name, "MyClass"); assert_eq!(results[0].kind, "class"); assert_eq!(results[0].file_path, "src/app.py"); assert_eq!(results[0].language, "python"); } #[test] fn test_search_empty_query() { let index = SymbolIndex::new().unwrap(); let nodes = vec![make_node( "src/lib.rs::foo", "foo", CodeNodeKind::Function, "src/lib.rs", "rust", )]; index.index_nodes(&nodes).unwrap(); // Empty query may parse error or return empty - both acceptable let result = index.search("", 10); // Just verify it doesn't panic let _ = result; } }