Implement end-to-end RAG pipeline: AST-aware code chunking, LiteLLM embedding generation, MongoDB vector storage with brute-force cosine similarity fallback for self-hosted instances, and a chat API with RAG-augmented responses. Add dedicated /chat/:repo_id dashboard page with embedding build controls, message history, and source reference cards. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
248 lines
7.6 KiB
Rust
248 lines
7.6 KiB
Rust
use mongodb::bson::doc;
|
|
use mongodb::options::IndexOptions;
|
|
use mongodb::{Client, Collection, IndexModel};
|
|
|
|
use compliance_core::models::*;
|
|
|
|
use crate::error::AgentError;
|
|
|
|
#[derive(Clone, Debug)]
|
|
pub struct Database {
|
|
inner: mongodb::Database,
|
|
}
|
|
|
|
impl Database {
|
|
pub async fn connect(uri: &str, db_name: &str) -> Result<Self, AgentError> {
|
|
let client = Client::with_uri_str(uri).await?;
|
|
let db = client.database(db_name);
|
|
db.run_command(doc! { "ping": 1 }).await?;
|
|
tracing::info!("Connected to MongoDB database '{db_name}'");
|
|
Ok(Self { inner: db })
|
|
}
|
|
|
|
pub async fn ensure_indexes(&self) -> Result<(), AgentError> {
|
|
// repositories: unique git_url
|
|
self.repositories()
|
|
.create_index(
|
|
IndexModel::builder()
|
|
.keys(doc! { "git_url": 1 })
|
|
.options(IndexOptions::builder().unique(true).build())
|
|
.build(),
|
|
)
|
|
.await?;
|
|
|
|
// findings: unique fingerprint
|
|
self.findings()
|
|
.create_index(
|
|
IndexModel::builder()
|
|
.keys(doc! { "fingerprint": 1 })
|
|
.options(IndexOptions::builder().unique(true).build())
|
|
.build(),
|
|
)
|
|
.await?;
|
|
|
|
// findings: repo_id + severity compound
|
|
self.findings()
|
|
.create_index(
|
|
IndexModel::builder()
|
|
.keys(doc! { "repo_id": 1, "severity": 1 })
|
|
.build(),
|
|
)
|
|
.await?;
|
|
|
|
// scan_runs: repo_id + started_at descending
|
|
self.scan_runs()
|
|
.create_index(
|
|
IndexModel::builder()
|
|
.keys(doc! { "repo_id": 1, "started_at": -1 })
|
|
.build(),
|
|
)
|
|
.await?;
|
|
|
|
// sbom_entries: compound
|
|
self.sbom_entries()
|
|
.create_index(
|
|
IndexModel::builder()
|
|
.keys(doc! { "repo_id": 1, "name": 1, "version": 1 })
|
|
.build(),
|
|
)
|
|
.await?;
|
|
|
|
// cve_alerts: unique cve_id + repo_id
|
|
self.cve_alerts()
|
|
.create_index(
|
|
IndexModel::builder()
|
|
.keys(doc! { "cve_id": 1, "repo_id": 1 })
|
|
.options(IndexOptions::builder().unique(true).build())
|
|
.build(),
|
|
)
|
|
.await?;
|
|
|
|
// tracker_issues: unique finding_id
|
|
self.tracker_issues()
|
|
.create_index(
|
|
IndexModel::builder()
|
|
.keys(doc! { "finding_id": 1 })
|
|
.options(IndexOptions::builder().unique(true).build())
|
|
.build(),
|
|
)
|
|
.await?;
|
|
|
|
// graph_nodes: compound (repo_id, graph_build_id)
|
|
self.graph_nodes()
|
|
.create_index(
|
|
IndexModel::builder()
|
|
.keys(doc! { "repo_id": 1, "graph_build_id": 1 })
|
|
.build(),
|
|
)
|
|
.await?;
|
|
|
|
// graph_edges: compound (repo_id, graph_build_id)
|
|
self.graph_edges()
|
|
.create_index(
|
|
IndexModel::builder()
|
|
.keys(doc! { "repo_id": 1, "graph_build_id": 1 })
|
|
.build(),
|
|
)
|
|
.await?;
|
|
|
|
// graph_builds: compound (repo_id, started_at DESC)
|
|
self.graph_builds()
|
|
.create_index(
|
|
IndexModel::builder()
|
|
.keys(doc! { "repo_id": 1, "started_at": -1 })
|
|
.build(),
|
|
)
|
|
.await?;
|
|
|
|
// impact_analyses: unique (repo_id, finding_id)
|
|
self.impact_analyses()
|
|
.create_index(
|
|
IndexModel::builder()
|
|
.keys(doc! { "repo_id": 1, "finding_id": 1 })
|
|
.options(IndexOptions::builder().unique(true).build())
|
|
.build(),
|
|
)
|
|
.await?;
|
|
|
|
// dast_targets: index on repo_id
|
|
self.dast_targets()
|
|
.create_index(IndexModel::builder().keys(doc! { "repo_id": 1 }).build())
|
|
.await?;
|
|
|
|
// dast_scan_runs: compound (target_id, started_at DESC)
|
|
self.dast_scan_runs()
|
|
.create_index(
|
|
IndexModel::builder()
|
|
.keys(doc! { "target_id": 1, "started_at": -1 })
|
|
.build(),
|
|
)
|
|
.await?;
|
|
|
|
// dast_findings: compound (scan_run_id, vuln_type)
|
|
self.dast_findings()
|
|
.create_index(
|
|
IndexModel::builder()
|
|
.keys(doc! { "scan_run_id": 1, "vuln_type": 1 })
|
|
.build(),
|
|
)
|
|
.await?;
|
|
|
|
// code_embeddings: compound (repo_id, graph_build_id)
|
|
self.code_embeddings()
|
|
.create_index(
|
|
IndexModel::builder()
|
|
.keys(doc! { "repo_id": 1, "graph_build_id": 1 })
|
|
.build(),
|
|
)
|
|
.await?;
|
|
|
|
// embedding_builds: compound (repo_id, started_at DESC)
|
|
self.embedding_builds()
|
|
.create_index(
|
|
IndexModel::builder()
|
|
.keys(doc! { "repo_id": 1, "started_at": -1 })
|
|
.build(),
|
|
)
|
|
.await?;
|
|
|
|
tracing::info!("Database indexes ensured");
|
|
Ok(())
|
|
}
|
|
|
|
pub fn repositories(&self) -> Collection<TrackedRepository> {
|
|
self.inner.collection("repositories")
|
|
}
|
|
|
|
pub fn findings(&self) -> Collection<Finding> {
|
|
self.inner.collection("findings")
|
|
}
|
|
|
|
pub fn scan_runs(&self) -> Collection<ScanRun> {
|
|
self.inner.collection("scan_runs")
|
|
}
|
|
|
|
pub fn sbom_entries(&self) -> Collection<SbomEntry> {
|
|
self.inner.collection("sbom_entries")
|
|
}
|
|
|
|
pub fn cve_alerts(&self) -> Collection<CveAlert> {
|
|
self.inner.collection("cve_alerts")
|
|
}
|
|
|
|
pub fn tracker_issues(&self) -> Collection<TrackerIssue> {
|
|
self.inner.collection("tracker_issues")
|
|
}
|
|
|
|
// Graph collections
|
|
pub fn graph_nodes(&self) -> Collection<compliance_core::models::graph::CodeNode> {
|
|
self.inner.collection("graph_nodes")
|
|
}
|
|
|
|
pub fn graph_edges(&self) -> Collection<compliance_core::models::graph::CodeEdge> {
|
|
self.inner.collection("graph_edges")
|
|
}
|
|
|
|
pub fn graph_builds(&self) -> Collection<compliance_core::models::graph::GraphBuildRun> {
|
|
self.inner.collection("graph_builds")
|
|
}
|
|
|
|
pub fn impact_analyses(&self) -> Collection<compliance_core::models::graph::ImpactAnalysis> {
|
|
self.inner.collection("impact_analyses")
|
|
}
|
|
|
|
// DAST collections
|
|
pub fn dast_targets(&self) -> Collection<DastTarget> {
|
|
self.inner.collection("dast_targets")
|
|
}
|
|
|
|
pub fn dast_scan_runs(&self) -> Collection<DastScanRun> {
|
|
self.inner.collection("dast_scan_runs")
|
|
}
|
|
|
|
pub fn dast_findings(&self) -> Collection<DastFinding> {
|
|
self.inner.collection("dast_findings")
|
|
}
|
|
|
|
// Embedding collections
|
|
pub fn code_embeddings(&self) -> Collection<compliance_core::models::embedding::CodeEmbedding> {
|
|
self.inner.collection("code_embeddings")
|
|
}
|
|
|
|
pub fn embedding_builds(
|
|
&self,
|
|
) -> Collection<compliance_core::models::embedding::EmbeddingBuildRun> {
|
|
self.inner.collection("embedding_builds")
|
|
}
|
|
|
|
#[allow(dead_code)]
|
|
pub fn raw_collection(&self, name: &str) -> Collection<mongodb::bson::Document> {
|
|
self.inner.collection(name)
|
|
}
|
|
|
|
/// Get the raw MongoDB database handle (for graph persistence)
|
|
pub fn inner(&self) -> &mongodb::Database {
|
|
&self.inner
|
|
}
|
|
}
|