// NOTE(review): in this copy of the file the text between angle brackets
// appears to have been stripped (`Arc>`, `Result {`, `Collection {`, ...).
// The code tokens below are kept exactly as found; restore the generic
// parameters from version control before building. Where the intended type
// can be read off the surrounding code it is mentioned in a hedged note.

use std::sync::Arc;

use dashmap::DashMap;
use mongodb::bson::doc;
use mongodb::options::IndexOptions;
use mongodb::{Client, Collection, IndexModel};
use sha2::{Digest, Sha256};

use compliance_core::models::*;
use compliance_core::TenantContext;

use crate::error::AgentError;

/// Mongo enforces a 63-byte cap on database names (older clusters: 64
/// on Linux, 63 on Windows; we target the conservative limit).
const MAX_DB_NAME_LEN: usize = 63;

/// Hex length of the SHA-256 truncation used for the hash fallback
/// tenant DB name (16 bytes → 32 hex chars). 16 bytes gives ~2^64
/// birthday-collision resistance — at our 10s-100s tenant scale this
/// is effectively impossible to hit.
const HASH_HEX_LEN: usize = 32;

/// Largest `db_prefix` that still guarantees the hash-fallback name
/// fits in the 63-byte cap: `prefix + "_" + 32 hex chars`
/// (63 - 1 - 32 = 30 bytes).
const MAX_PREFIX_LEN: usize = MAX_DB_NAME_LEN - 1 - HASH_HEX_LEN;

/// Per-tenant Mongo connection broker (M7.2 isolation model).
///
/// Holds one [`Client`] and hands out [`Database`] handles physically
/// scoped to `<db_prefix>_<tenant_id>`. The driver is the isolation
/// boundary — a handle for tenant A cannot see tenant B's documents
/// because it is connected to a different database, not because of an
/// application-level filter.
///
/// Index creation runs idempotently the first time each tenant is seen
/// in the process's lifetime. Mongo's `createIndex` is itself idempotent
/// by index name; the in-memory `ensured` set just skips the round-trip.
#[derive(Clone, Debug)]
pub struct DatabasePool {
    /// Shared cluster client; every tenant handle is derived from it.
    client: Client,
    /// Prefix placed in front of every tenant database name.
    db_prefix: String,
    /// Tenant ids whose indexes have been (or are being) ensured in this
    /// process. Used as a set: the value is always `()`.
    // NOTE(review): type parameters stripped here; presumably
    // `Arc<DashMap<String, ()>>` given the `insert(tenant_id.to_string(), ())`
    // call in `for_tenant_id` — confirm.
    ensured: Arc>,
}

impl DatabasePool {
    /// Connect to the cluster and prepare to hand out tenant databases
    /// named `<db_prefix>_<tenant_id>`.
    ///
    /// Validates `db_prefix.len() <= MAX_PREFIX_LEN` so the
    /// hash-fallback path is provably within Mongo's 63-byte db-name
    /// cap. Refuses to construct a pool that could ever produce an
    /// over-long name.
    ///
    /// # Errors
    ///
    /// Returns `AgentError::Other` when the prefix is too long, and
    /// propagates driver errors from building the client or from the
    /// `ping` issued against the `admin` database.
    // NOTE(review): return type parameters stripped; presumably
    // `Result<Self, AgentError>` (the body returns `Ok(Self { .. })` and
    // `Err(AgentError::Other(..))`).
    pub async fn connect(uri: &str, db_prefix: &str) -> Result {
        // `str::len` is a byte count, which is the unit Mongo's cap is
        // expressed in; the message below says "chars", which only matches
        // for ASCII prefixes.
        if db_prefix.len() > MAX_PREFIX_LEN {
            return Err(AgentError::Other(format!(
                "db_prefix '{db_prefix}' is {} chars; max is {MAX_PREFIX_LEN} so the \
                 hash-fallback tenant DB name fits Mongo's {MAX_DB_NAME_LEN}-byte cap",
                db_prefix.len()
            )));
        }

        let client = Client::with_uri_str(uri).await?;
        // Ping so that an unreachable cluster fails here rather than on the
        // first tenant request.
        client
            .database("admin")
            .run_command(doc! { "ping": 1 })
            .await?;
        tracing::info!(
            "MongoDB cluster reachable; per-tenant pool ready (db prefix '{db_prefix}')"
        );

        Ok(Self {
            client,
            db_prefix: db_prefix.to_string(),
            ensured: Arc::new(DashMap::new()),
        })
    }

    /// Return a [`Database`] scoped to this tenant. Ensures indexes on
    /// first call per tenant (per process). Cheap on the hot path —
    /// subsequent calls skip the round-trip.
    ///
    /// Thin wrapper: forwards `ctx.tenant_id` to [`Self::for_tenant_id`].
    // NOTE(review): return type parameters stripped; presumably
    // `Result<Database, AgentError>`.
    pub async fn for_tenant(&self, ctx: &TenantContext) -> Result {
        self.for_tenant_id(&ctx.tenant_id).await
    }

    /// Like [`Self::for_tenant`] but accepts a bare tenant_id.
    /// For background paths (scheduler, webhooks, pipeline orchestrators)
    /// that don't have a full [`TenantContext`] but know which tenant
    /// they're operating on (typically resolved from a URL path, a job
    /// argument, or the registry).
    ///
    /// # Errors
    ///
    /// Propagates the error from [`Database::ensure_indexes`] when this
    /// call is the one that performs the index work; the tenant's marker
    /// is removed first so a later call tries again.
    // NOTE(review): return type parameters stripped; presumably
    // `Result<Database, AgentError>` (the body returns `Ok(db)`).
    pub async fn for_tenant_id(&self, tenant_id: &str) -> Result {
        let db_name = self.tenant_db_name(tenant_id);
        let db = Database::from_database(self.client.database(&db_name));

        // `DashMap::insert` returns the previous value; `None` means we
        // were the first writer for this tenant_id and own the
        // index-ensure work.
        //
        // NOTE(review): the marker is written *before* `ensure_indexes`
        // finishes, so a concurrent caller for the same tenant sees the
        // marker and gets its handle back while index creation is still in
        // flight (or about to fail and be rolled back). Confirm callers
        // tolerate a handle whose unique indexes may not exist yet.
        // NOTE(review): `tenant_id.to_string()` allocates on every call,
        // including the already-ensured path.
        if self.ensured.insert(tenant_id.to_string(), ()).is_none() {
            if let Err(e) = db.ensure_indexes().await {
                // Roll the marker back so the next request retries.
                self.ensured.remove(tenant_id);
                return Err(e);
            }
            tracing::debug!(
                tenant_id = %tenant_id,
                db_name = %db_name,
                "Indexes ensured for tenant database"
            );
        }

        Ok(db)
    }

    /// Compute the Mongo database name for a tenant. Public for tests
    /// and tenant offboarding (`pool.client().database(name).drop()`).
    ///
    /// Format: `<db_prefix>_<sanitized tenant_id>` if it fits the 63-byte
    /// cap, else `<db_prefix>_<32 hex chars of SHA-256(tenant_id)>`. The
    /// `db_prefix` length invariant established at [`Self::connect`]
    /// guarantees the hash-fallback name always fits — no runtime
    /// assertion needed.
    ///
    /// Collision resistance: the hash fallback is a 16-byte SHA-256
    /// truncation, which gives ~2^64 birthday-collision resistance. At
    /// our 10s–100s tenant scale the probability of two tenant_ids
    /// colliding is effectively zero. (8-byte truncation would have
    /// been ~2^32 — too close for comfort on a regulated product.)
    ///
    /// NOTE(review): the non-hashed path is not injective. Sanitization
    /// rewrites several characters to `_`, so e.g. `a.b` and `a_b` yield
    /// the same database name. Harmless while tenant_ids are UUIDs;
    /// revisit before accepting free-form tenant_ids.
    pub fn tenant_db_name(&self, tenant_id: &str) -> String {
        let sanitized = sanitize_tenant_id(tenant_id);
        let natural = format!("{}_{}", self.db_prefix, sanitized);
        // `String::len` is in bytes, matching the unit of the cap.
        if natural.len() <= MAX_DB_NAME_LEN {
            natural
        } else {
            // The hash is taken over the raw tenant_id, not the sanitized
            // form, so ids that differ only in sanitized characters still
            // get distinct fallback names.
            let mut hasher = Sha256::new();
            hasher.update(tenant_id.as_bytes());
            let digest = hasher.finalize();
            // First HASH_HEX_LEN / 2 = 16 digest bytes → 32 hex characters.
            let suffix = hex::encode(&digest[..HASH_HEX_LEN / 2]);
            format!("{}_{}", self.db_prefix, suffix)
        }
    }

    /// Raw client handle. Reserved for cross-tenant admin flows that
    /// must opt in explicitly (tenant listing, drop-on-offboard).
    pub fn client(&self) -> &Client {
        &self.client
    }

    /// List every Mongo database currently belonging to this pool,
    /// identified by the `<db_prefix>_` prefix. The result is the raw
    /// database names — opening one for offboarding/cleanup goes
    /// through [`Self::client`].
    ///
    /// Note: hashed-fallback names (very long tenant_ids) lose the
    /// original tenant_id at the cluster level — we know a database
    /// exists for *some* tenant but not which one. In practice
    /// tenant_ids are UUIDs (36 chars) and never hit the fallback,
    /// so this is a theoretical concern, not an operational one.
    ///
    /// # Errors
    ///
    /// Propagates the driver error from `list_database_names`.
    // NOTE(review): return type partially stripped (`Result, AgentError>`);
    // presumably `Result<Vec<String>, AgentError>` — confirm.
    pub async fn list_tenant_db_names(&self) -> Result, AgentError> {
        let prefix = format!("{}_", self.db_prefix);
        let names = self.client.list_database_names().await?;
        Ok(names
            .into_iter()
            .filter(|n| n.starts_with(&prefix))
            .collect())
    }

    /// Drop the database for a specific tenant. Used by GDPR delete
    /// and tenant offboarding. Idempotent — dropping a non-existent
    /// database is a no-op at the driver level.
    ///
    /// Also evicts the tenant from the in-memory `ensured` set so a
    /// later re-provision triggers fresh `ensure_indexes`.
    ///
    /// # Errors
    ///
    /// Propagates the driver error from `drop()`. In that case the
    /// `ensured` marker is left untouched because the `?` returns before
    /// the eviction runs.
    pub async fn drop_tenant(&self, tenant_id: &str) -> Result<(), AgentError> {
        let db_name = self.tenant_db_name(tenant_id);
        self.client.database(&db_name).drop().await?;
        self.ensured.remove(tenant_id);
        tracing::info!(
            tenant_id = %tenant_id,
            db_name = %db_name,
            "Dropped tenant database"
        );
        Ok(())
    }
}

/// Mongo database names disallow `/`, `\`, `.`, `"`, `$`, ` `, and NUL.
/// breakpilot-dev tenant_ids are UUIDs so this is belt-and-braces, but
/// it lets the pool tolerate any future tenant_id shape without surprise.
///
/// Each of those characters is replaced by `_`; every other character is
/// passed through unchanged. The mapping is lossy: distinct inputs can
/// produce the same output.
fn sanitize_tenant_id(tenant_id: &str) -> String {
    tenant_id
        .chars()
        .map(|c| match c {
            '/' | '\\' | '.' | '"' | '$' | ' ' | '\0' => '_',
            c => c,
        })
        .collect()
}

/// Thin wrapper around one `mongodb::Database` that exposes a named
/// accessor per collection and knows how to create their indexes.
#[derive(Clone, Debug)]
pub struct Database {
    /// The underlying driver handle all accessors delegate to.
    inner: mongodb::Database,
}

impl Database {
    /// Open a fresh client against `uri`, select `db_name`, and ping that
    /// database before returning the wrapper.
    ///
    /// # Errors
    ///
    /// Propagates driver errors from building the client or from the ping.
    // NOTE(review): return type parameters stripped; presumably
    // `Result<Self, AgentError>` (the body returns `Ok(Self { inner: db })`).
    pub async fn connect(uri: &str, db_name: &str) -> Result {
        let client = Client::with_uri_str(uri).await?;
        let db = client.database(db_name);
        db.run_command(doc! { "ping": 1 }).await?;
        tracing::info!("Connected to MongoDB database '{db_name}'");
        Ok(Self { inner: db })
    }

    /// Wrap an already-resolved Mongo database. Used by [`DatabasePool`]
    /// to hand out tenant-scoped handles without a fresh client per tenant.
    pub(crate) fn from_database(inner: mongodb::Database) -> Self {
        Self { inner }
    }

    /// Create the indexes for every typed collection exposed by this type.
    ///
    /// The `create_index` calls are awaited one after another; the first
    /// failure is returned through `?`, so any indexes listed after it
    /// are not attempted in that call.
    ///
    /// # Errors
    ///
    /// Propagates the first driver error from `create_index`.
    pub async fn ensure_indexes(&self) -> Result<(), AgentError> {
        // repositories: unique git_url
        self.repositories()
            .create_index(
                IndexModel::builder()
                    .keys(doc! { "git_url": 1 })
                    .options(IndexOptions::builder().unique(true).build())
                    .build(),
            )
            .await?;

        // findings: unique fingerprint
        self.findings()
            .create_index(
                IndexModel::builder()
                    .keys(doc! { "fingerprint": 1 })
                    .options(IndexOptions::builder().unique(true).build())
                    .build(),
            )
            .await?;

        // findings: repo_id + severity compound
        self.findings()
            .create_index(
                IndexModel::builder()
                    .keys(doc! { "repo_id": 1, "severity": 1 })
                    .build(),
            )
            .await?;

        // scan_runs: repo_id + started_at descending
        self.scan_runs()
            .create_index(
                IndexModel::builder()
                    .keys(doc! { "repo_id": 1, "started_at": -1 })
                    .build(),
            )
            .await?;

        // sbom_entries: compound (repo_id, name, version)
        self.sbom_entries()
            .create_index(
                IndexModel::builder()
                    .keys(doc! { "repo_id": 1, "name": 1, "version": 1 })
                    .build(),
            )
            .await?;

        // cve_alerts: unique cve_id + repo_id
        self.cve_alerts()
            .create_index(
                IndexModel::builder()
                    .keys(doc! { "cve_id": 1, "repo_id": 1 })
                    .options(IndexOptions::builder().unique(true).build())
                    .build(),
            )
            .await?;

        // cve_notifications: unique cve_id + repo_id + package name/version,
        // plus a separate (status, created_at DESC) index for status filtering
        self.cve_notifications()
            .create_index(
                IndexModel::builder()
                    .keys(
                        doc! { "cve_id": 1, "repo_id": 1, "package_name": 1, "package_version": 1 },
                    )
                    .options(IndexOptions::builder().unique(true).build())
                    .build(),
            )
            .await?;
        self.cve_notifications()
            .create_index(
                IndexModel::builder()
                    .keys(doc! { "status": 1, "created_at": -1 })
                    .build(),
            )
            .await?;

        // tracker_issues: unique finding_id
        self.tracker_issues()
            .create_index(
                IndexModel::builder()
                    .keys(doc! { "finding_id": 1 })
                    .options(IndexOptions::builder().unique(true).build())
                    .build(),
            )
            .await?;

        // graph_nodes: compound (repo_id, graph_build_id)
        self.graph_nodes()
            .create_index(
                IndexModel::builder()
                    .keys(doc! { "repo_id": 1, "graph_build_id": 1 })
                    .build(),
            )
            .await?;

        // graph_edges: compound (repo_id, graph_build_id)
        self.graph_edges()
            .create_index(
                IndexModel::builder()
                    .keys(doc! { "repo_id": 1, "graph_build_id": 1 })
                    .build(),
            )
            .await?;

        // graph_builds: compound (repo_id, started_at DESC)
        self.graph_builds()
            .create_index(
                IndexModel::builder()
                    .keys(doc! { "repo_id": 1, "started_at": -1 })
                    .build(),
            )
            .await?;

        // impact_analyses: unique (repo_id, finding_id)
        self.impact_analyses()
            .create_index(
                IndexModel::builder()
                    .keys(doc! { "repo_id": 1, "finding_id": 1 })
                    .options(IndexOptions::builder().unique(true).build())
                    .build(),
            )
            .await?;

        // dast_targets: index on repo_id
        self.dast_targets()
            .create_index(IndexModel::builder().keys(doc! { "repo_id": 1 }).build())
            .await?;

        // dast_scan_runs: compound (target_id, started_at DESC)
        self.dast_scan_runs()
            .create_index(
                IndexModel::builder()
                    .keys(doc! { "target_id": 1, "started_at": -1 })
                    .build(),
            )
            .await?;

        // dast_findings: compound (scan_run_id, vuln_type)
        self.dast_findings()
            .create_index(
                IndexModel::builder()
                    .keys(doc! { "scan_run_id": 1, "vuln_type": 1 })
                    .build(),
            )
            .await?;

        // code_embeddings: compound (repo_id, graph_build_id)
        self.code_embeddings()
            .create_index(
                IndexModel::builder()
                    .keys(doc! { "repo_id": 1, "graph_build_id": 1 })
                    .build(),
            )
            .await?;

        // embedding_builds: compound (repo_id, started_at DESC)
        self.embedding_builds()
            .create_index(
                IndexModel::builder()
                    .keys(doc! { "repo_id": 1, "started_at": -1 })
                    .build(),
            )
            .await?;

        // pentest_sessions: compound (target_id, started_at DESC)
        self.pentest_sessions()
            .create_index(
                IndexModel::builder()
                    .keys(doc! { "target_id": 1, "started_at": -1 })
                    .build(),
            )
            .await?;

        // pentest_sessions: status index
        self.pentest_sessions()
            .create_index(IndexModel::builder().keys(doc! { "status": 1 }).build())
            .await?;

        // attack_chain_nodes: compound (session_id, node_id)
        self.attack_chain_nodes()
            .create_index(
                IndexModel::builder()
                    .keys(doc! { "session_id": 1, "node_id": 1 })
                    .build(),
            )
            .await?;

        // pentest_messages: compound (session_id, created_at)
        self.pentest_messages()
            .create_index(
                IndexModel::builder()
                    .keys(doc! { "session_id": 1, "created_at": 1 })
                    .build(),
            )
            .await?;

        tracing::info!("Database indexes ensured");
        Ok(())
    }

    // NOTE(review): every accessor below has lost its `Collection<...>`
    // element type in this copy. The element types are not recoverable
    // from this file alone (they presumably come from
    // `compliance_core::models`) — restore them from version control.

    /// Handle to the `repositories` collection.
    pub fn repositories(&self) -> Collection {
        self.inner.collection("repositories")
    }

    /// Handle to the `findings` collection.
    pub fn findings(&self) -> Collection {
        self.inner.collection("findings")
    }

    /// Handle to the `scan_runs` collection.
    pub fn scan_runs(&self) -> Collection {
        self.inner.collection("scan_runs")
    }

    /// Handle to the `sbom_entries` collection.
    pub fn sbom_entries(&self) -> Collection {
        self.inner.collection("sbom_entries")
    }

    /// Handle to the `cve_alerts` collection.
    pub fn cve_alerts(&self) -> Collection {
        self.inner.collection("cve_alerts")
    }

    /// Handle to the `cve_notifications` collection.
    pub fn cve_notifications(
        &self,
    ) -> Collection {
        self.inner.collection("cve_notifications")
    }

    /// Handle to the `tracker_issues` collection.
    pub fn tracker_issues(&self) -> Collection {
        self.inner.collection("tracker_issues")
    }

    // Graph collections

    /// Handle to the `graph_nodes` collection.
    pub fn graph_nodes(&self) -> Collection {
        self.inner.collection("graph_nodes")
    }

    /// Handle to the `graph_edges` collection.
    pub fn graph_edges(&self) -> Collection {
        self.inner.collection("graph_edges")
    }

    /// Handle to the `graph_builds` collection.
    pub fn graph_builds(&self) -> Collection {
        self.inner.collection("graph_builds")
    }

    /// Handle to the `impact_analyses` collection.
    pub fn impact_analyses(&self) -> Collection {
        self.inner.collection("impact_analyses")
    }

    // DAST collections

    /// Handle to the `dast_targets` collection.
    pub fn dast_targets(&self) -> Collection {
        self.inner.collection("dast_targets")
    }

    /// Handle to the `dast_scan_runs` collection.
    pub fn dast_scan_runs(&self) -> Collection {
        self.inner.collection("dast_scan_runs")
    }

    /// Handle to the `dast_findings` collection.
    pub fn dast_findings(&self) -> Collection {
        self.inner.collection("dast_findings")
    }

    // Embedding collections

    /// Handle to the `code_embeddings` collection.
    pub fn code_embeddings(&self) -> Collection {
        self.inner.collection("code_embeddings")
    }

    /// Handle to the `embedding_builds` collection.
    pub fn embedding_builds(
        &self,
    ) -> Collection {
        self.inner.collection("embedding_builds")
    }

    // Pentest collections

    /// Handle to the `pentest_sessions` collection.
    pub fn pentest_sessions(&self) -> Collection {
        self.inner.collection("pentest_sessions")
    }

    /// Handle to the `attack_chain_nodes` collection.
    pub fn attack_chain_nodes(&self) -> Collection {
        self.inner.collection("attack_chain_nodes")
    }

    /// Handle to the `pentest_messages` collection.
    pub fn pentest_messages(&self) -> Collection {
        self.inner.collection("pentest_messages")
    }

    /// Handle to an arbitrary collection selected by `name`.
    // NOTE(review): the `allow` has no stated reason; presumably this is
    // kept for ad-hoc access and currently has no in-crate caller — confirm.
    #[allow(dead_code)]
    pub fn raw_collection(&self, name: &str) -> Collection {
        self.inner.collection(name)
    }

    /// Get the raw MongoDB database handle (for graph persistence)
    pub fn inner(&self) -> &mongodb::Database {
        &self.inner
    }
}