compliance-scanner-agent/compliance-agent/src/database.rs

use std::sync::Arc;

use dashmap::DashMap;
use mongodb::bson::doc;
use mongodb::options::IndexOptions;
use mongodb::{Client, Collection, IndexModel};
use sha2::{Digest, Sha256};

use compliance_core::models::*;
use compliance_core::TenantContext;

use crate::error::AgentError;

/// Mongo enforces a 63-byte cap on database names (older clusters: 64
/// on Linux, 63 on Windows; we target the conservative limit).
///
/// Compared against `str::len()` results, so the unit is UTF-8 bytes,
/// not characters.
const MAX_DB_NAME_LEN: usize = 63;

/// Hex length of the SHA-256 truncation used for the hash fallback
/// tenant DB name (16 bytes → 32 hex chars). 16 bytes gives ~2^64
/// birthday-collision resistance — at our 10s-100s tenant scale this
/// is effectively impossible to hit.
///
/// The digest is sliced to `HASH_HEX_LEN / 2` bytes before hex
/// encoding, so this value must stay even and must not exceed 64
/// (a SHA-256 digest is 32 bytes long).
const HASH_HEX_LEN: usize = 32;

/// Largest `db_prefix` that still guarantees the hash-fallback name
/// fits in the 63-byte cap: `prefix + "_" + 32 hex chars`, i.e.
/// 63 - 1 - 32 = 30 bytes.
const MAX_PREFIX_LEN: usize = MAX_DB_NAME_LEN - 1 - HASH_HEX_LEN;

/// Per-tenant Mongo connection broker (M7.2 isolation model).
///
/// Holds one [`Client`] and hands out [`Database`] handles physically
/// scoped to `<db_prefix>_<tenant_id>`. The driver is the isolation
/// boundary — a handle for tenant A cannot see tenant B's documents
/// because it is connected to a different database, not because of an
/// application-level filter.
///
/// Index creation runs idempotently the first time each tenant is seen
/// in the process's lifetime. Mongo's `createIndex` is itself idempotent
/// by index name; the in-memory `ensured` set just skips the round-trip.
///
/// Cloning the pool is cheap bookkeeping-wise: the `ensured` set sits
/// behind an `Arc`, so every clone observes the same set of tenants.
#[derive(Clone, Debug)]
pub struct DatabasePool {
    /// Driver client from which every tenant database handle is derived.
    client: Client,
    /// Prefix placed in front of every tenant database name; its length
    /// is validated in `DatabasePool::connect`.
    db_prefix: String,
    /// Concurrent set of tenant_ids marked as index-ensured (the `()`
    /// value carries no information — only key presence matters).
    ensured: Arc<DashMap<String, ()>>,
}

impl DatabasePool {
    /// Connect to the cluster and prepare to hand out tenant databases
    /// named `<db_prefix>_<tenant_id>`.
    ///
    /// Validates `db_prefix.len() <= MAX_PREFIX_LEN` so the
    /// hash-fallback path is provably within Mongo's 63-byte db-name
    /// cap. Refuses to construct a pool that could ever produce an
    /// over-long name.
    pub async fn connect(uri: &str, db_prefix: &str) -> Result<Self, AgentError> {
        if db_prefix.len() > MAX_PREFIX_LEN {
            return Err(AgentError::Other(format!(
                "db_prefix '{db_prefix}' is {} chars; max is {MAX_PREFIX_LEN} so the \
                 hash-fallback tenant DB name fits Mongo's {MAX_DB_NAME_LEN}-byte cap",
                db_prefix.len()
            )));
        }
        let client = Client::with_uri_str(uri).await?;
        client
            .database("admin")
            .run_command(doc! { "ping": 1 })
            .await?;
        tracing::info!(
            "MongoDB cluster reachable; per-tenant pool ready (db prefix '{db_prefix}')"
        );
        Ok(Self {
            client,
            db_prefix: db_prefix.to_string(),
            ensured: Arc::new(DashMap::new()),
        })
    }

    /// Hand out the [`Database`] handle belonging to the tenant named in
    /// `ctx`.
    ///
    /// Thin wrapper: forwards `ctx.tenant_id` to [`Self::for_tenant_id`],
    /// which takes care of index creation the first time a tenant is
    /// seen in this process and skips that round-trip afterwards.
    pub async fn for_tenant(&self, ctx: &TenantContext) -> Result<Database, AgentError> {
        let tenant_id = &ctx.tenant_id;
        self.for_tenant_id(tenant_id).await
    }

    /// Like [`Self::for_tenant`] but accepts a bare tenant_id.
    /// For background paths (scheduler, webhooks, pipeline orchestrators)
    /// that don't have a full [`TenantContext`] but know which tenant
    /// they're operating on (typically resolved from a URL path, a job
    /// argument, or the registry).
    ///
    /// The tenant is recorded in the `ensured` set only *after*
    /// `ensure_indexes` has succeeded. Consequences:
    ///
    /// * No caller receives a handle for a tenant whose indexes have not
    ///   been created yet — concurrent first callers each await their
    ///   own `ensure_indexes` run instead of racing past a marker that
    ///   was set optimistically.
    /// * If this future is dropped while index creation is in flight
    ///   (request cancellation, timeout), nothing was recorded, so the
    ///   next call simply tries again.
    ///
    /// The price is that several simultaneous first calls for the same
    /// tenant may each issue the index commands; that relies on index
    /// creation being idempotent on the server.
    ///
    /// # Errors
    ///
    /// Propagates the error from `ensure_indexes`; the tenant stays
    /// unmarked so a later call retries.
    pub async fn for_tenant_id(&self, tenant_id: &str) -> Result<Database, AgentError> {
        let db_name = self.tenant_db_name(tenant_id);
        let db = Database::from_database(self.client.database(&db_name));

        // Hot path: indexes were already ensured during this process's lifetime.
        if self.ensured.contains_key(tenant_id) {
            return Ok(db);
        }

        db.ensure_indexes().await?;
        // Mark only now, so neither a failure nor a dropped future can leave
        // the tenant flagged as ensured without the work having completed.
        self.ensured.insert(tenant_id.to_string(), ());
        tracing::debug!(
            tenant_id = %tenant_id,
            db_name = %db_name,
            "Indexes ensured for tenant database"
        );
        Ok(db)
    }

    /// Compute the Mongo database name for a tenant. Public for tests
    /// and tenant offboarding (`pool.client().database(name).drop()`).
    ///
    /// Format: `<prefix>_<tenant_id>` when the tenant_id contains no
    /// character that `sanitize_tenant_id` would rewrite *and* the
    /// result fits the 63-byte cap; otherwise
    /// `<prefix>_<sha256-16-byte-hex-of-tenant_id>`. The `db_prefix`
    /// length invariant established at [`Self::connect`] guarantees the
    /// hash-fallback name always fits — no runtime assertion needed.
    ///
    /// Why ids with forbidden characters are hashed instead of
    /// sanitised: sanitising is lossy (`a.b` and `a_b` would both become
    /// `<prefix>_a_b`), which would put two distinct tenants in the same
    /// database and defeat the isolation this pool exists to provide.
    /// The hash is computed over the original, unsanitised tenant_id, so
    /// distinct ids keep distinct names.
    ///
    /// Behaviour change to be aware of: a tenant_id containing one of
    /// those characters used to map to `<prefix>_<sanitized>`; it now
    /// maps to the hash form. Ids without such characters (e.g. UUIDs)
    /// are unaffected.
    ///
    /// Collision resistance: the hash fallback is a 16-byte SHA-256
    /// truncation, which gives ~2^64 birthday-collision resistance. At
    /// our 10s–100s tenant scale the probability of two tenant_ids
    /// colliding is effectively zero. (8-byte truncation would have
    /// been ~2^32 — too close for comfort on a regulated product.)
    pub fn tenant_db_name(&self, tenant_id: &str) -> String {
        // The readable form is only safe when it round-trips the id exactly.
        let is_clean = sanitize_tenant_id(tenant_id) == tenant_id;
        if is_clean {
            let natural = format!("{}_{}", self.db_prefix, tenant_id);
            if natural.len() <= MAX_DB_NAME_LEN {
                return natural;
            }
        }

        // Fallback: fixed-length name derived from the raw tenant_id bytes.
        let mut hasher = Sha256::new();
        hasher.update(tenant_id.as_bytes());
        let digest = hasher.finalize();
        let suffix = hex::encode(&digest[..HASH_HEX_LEN / 2]);
        format!("{}_{}", self.db_prefix, suffix)
    }

    /// Raw client handle. Reserved for cross-tenant admin flows that
    /// must opt in explicitly (tenant listing, drop-on-offboard).
    ///
    /// Anything opened through this handle bypasses
    /// [`Self::for_tenant_id`]: no `<db_prefix>_` naming is applied, no
    /// indexes are ensured, and the `ensured` set is not touched.
    pub fn client(&self) -> &Client {
        &self.client
    }

    /// Enumerate the databases on the cluster whose name starts with
    /// `<db_prefix>_`, returning the raw database names in the order the
    /// driver reported them. Opening one of them for offboarding or
    /// cleanup goes through [`Self::client`].
    ///
    /// The match is a plain string-prefix test, so any database that
    /// happens to start with `<db_prefix>_` is included, whoever created
    /// it.
    ///
    /// Note: hashed-fallback names (very long tenant_ids) lose the
    /// original tenant_id at the cluster level — we know a database
    /// exists for *some* tenant but not which one. In practice
    /// tenant_ids are UUIDs (36 chars) and never hit the fallback,
    /// so this is a theoretical concern, not an operational one.
    ///
    /// # Errors
    ///
    /// Propagates the driver error if listing databases fails.
    pub async fn list_tenant_db_names(&self) -> Result<Vec<String>, AgentError> {
        let mut names = self.client.list_database_names().await?;
        let prefix = format!("{}_", self.db_prefix);
        // Filter in place; keeps the driver's ordering of the survivors.
        names.retain(|name| name.starts_with(&prefix));
        Ok(names)
    }

    /// Delete the database that belongs to `tenant_id`. Used by GDPR
    /// delete and tenant offboarding. Idempotent — dropping a
    /// non-existent database is a no-op at the driver level.
    ///
    /// After a successful drop the tenant is also evicted from the
    /// in-memory `ensured` set, so a later re-provision triggers a fresh
    /// `ensure_indexes`. When the drop fails the set is left untouched
    /// and the error is returned.
    pub async fn drop_tenant(&self, tenant_id: &str) -> Result<(), AgentError> {
        let db_name = self.tenant_db_name(tenant_id);
        let doomed = self.client.database(&db_name);
        doomed.drop().await?;

        // Only forget the tenant once the database is really gone.
        self.ensured.remove(tenant_id);

        tracing::info!(
            tenant_id = %tenant_id,
            db_name = %db_name,
            "Dropped tenant database"
        );
        Ok(())
    }
}

/// Replace every character Mongo rejects in a database name — `/`, `\`,
/// `.`, `"`, `$`, space and NUL — with `_`, leaving all other
/// characters (including non-ASCII ones) untouched.
///
/// breakpilot-dev tenant_ids are UUIDs so this is belt-and-braces, but
/// it lets the pool tolerate any future tenant_id shape without surprise.
///
/// The mapping is lossy: inputs that differ only in a forbidden
/// character versus `_` produce the same output.
fn sanitize_tenant_id(tenant_id: &str) -> String {
    const FORBIDDEN: [char; 7] = ['/', '\\', '.', '"', '$', ' ', '\0'];
    tenant_id.replace(&FORBIDDEN[..], "_")
}

/// Typed wrapper around a single `mongodb::Database`.
///
/// Exposes one accessor per collection (each returning a `Collection`
/// typed with the matching model) and `ensure_indexes`, which creates
/// the indexes declared in this file.
#[derive(Clone, Debug)]
pub struct Database {
    /// The wrapped driver handle; every accessor resolves collections on it.
    inner: mongodb::Database,
}

impl Database {
    /// Open a dedicated client for `uri` and wrap its `db_name` database.
    ///
    /// A `ping` command is sent to that database before returning, so an
    /// unreachable server is reported here rather than on first use.
    /// No indexes are created; call [`Self::ensure_indexes`] for that.
    ///
    /// # Errors
    ///
    /// Propagates the driver error if the client cannot be built from
    /// `uri` or the ping fails.
    pub async fn connect(uri: &str, db_name: &str) -> Result<Self, AgentError> {
        let inner = Client::with_uri_str(uri).await?.database(db_name);
        inner.run_command(doc! { "ping": 1 }).await?;
        tracing::info!("Connected to MongoDB database '{db_name}'");
        Ok(Self { inner })
    }

    /// Wrap an already-resolved Mongo database. Used by [`DatabasePool`]
    /// to hand out tenant-scoped handles without a fresh client per tenant.
    ///
    /// Purely a constructor: performs no I/O — no ping and no index
    /// creation happen here.
    pub(crate) fn from_database(inner: mongodb::Database) -> Self {
        Self { inner }
    }

    pub async fn ensure_indexes(&self) -> Result<(), AgentError> {
        // repositories: unique git_url
        self.repositories()
            .create_index(
                IndexModel::builder()
                    .keys(doc! { "git_url": 1 })
                    .options(IndexOptions::builder().unique(true).build())
                    .build(),
            )
            .await?;

        // findings: unique fingerprint
        self.findings()
            .create_index(
                IndexModel::builder()
                    .keys(doc! { "fingerprint": 1 })
                    .options(IndexOptions::builder().unique(true).build())
                    .build(),
            )
            .await?;

        // findings: repo_id + severity compound
        self.findings()
            .create_index(
                IndexModel::builder()
                    .keys(doc! { "repo_id": 1, "severity": 1 })
                    .build(),
            )
            .await?;

        // scan_runs: repo_id + started_at descending
        self.scan_runs()
            .create_index(
                IndexModel::builder()
                    .keys(doc! { "repo_id": 1, "started_at": -1 })
                    .build(),
            )
            .await?;

        // sbom_entries: compound
        self.sbom_entries()
            .create_index(
                IndexModel::builder()
                    .keys(doc! { "repo_id": 1, "name": 1, "version": 1 })
                    .build(),
            )
            .await?;

        // cve_alerts: unique cve_id + repo_id
        self.cve_alerts()
            .create_index(
                IndexModel::builder()
                    .keys(doc! { "cve_id": 1, "repo_id": 1 })
                    .options(IndexOptions::builder().unique(true).build())
                    .build(),
            )
            .await?;

        // cve_notifications: unique cve_id + repo_id + package, status filter
        self.cve_notifications()
            .create_index(
                IndexModel::builder()
                    .keys(
                        doc! { "cve_id": 1, "repo_id": 1, "package_name": 1, "package_version": 1 },
                    )
                    .options(IndexOptions::builder().unique(true).build())
                    .build(),
            )
            .await?;
        self.cve_notifications()
            .create_index(
                IndexModel::builder()
                    .keys(doc! { "status": 1, "created_at": -1 })
                    .build(),
            )
            .await?;

        // tracker_issues: unique finding_id
        self.tracker_issues()
            .create_index(
                IndexModel::builder()
                    .keys(doc! { "finding_id": 1 })
                    .options(IndexOptions::builder().unique(true).build())
                    .build(),
            )
            .await?;

        // graph_nodes: compound (repo_id, graph_build_id)
        self.graph_nodes()
            .create_index(
                IndexModel::builder()
                    .keys(doc! { "repo_id": 1, "graph_build_id": 1 })
                    .build(),
            )
            .await?;

        // graph_edges: compound (repo_id, graph_build_id)
        self.graph_edges()
            .create_index(
                IndexModel::builder()
                    .keys(doc! { "repo_id": 1, "graph_build_id": 1 })
                    .build(),
            )
            .await?;

        // graph_builds: compound (repo_id, started_at DESC)
        self.graph_builds()
            .create_index(
                IndexModel::builder()
                    .keys(doc! { "repo_id": 1, "started_at": -1 })
                    .build(),
            )
            .await?;

        // impact_analyses: unique (repo_id, finding_id)
        self.impact_analyses()
            .create_index(
                IndexModel::builder()
                    .keys(doc! { "repo_id": 1, "finding_id": 1 })
                    .options(IndexOptions::builder().unique(true).build())
                    .build(),
            )
            .await?;

        // dast_targets: index on repo_id
        self.dast_targets()
            .create_index(IndexModel::builder().keys(doc! { "repo_id": 1 }).build())
            .await?;

        // dast_scan_runs: compound (target_id, started_at DESC)
        self.dast_scan_runs()
            .create_index(
                IndexModel::builder()
                    .keys(doc! { "target_id": 1, "started_at": -1 })
                    .build(),
            )
            .await?;

        // dast_findings: compound (scan_run_id, vuln_type)
        self.dast_findings()
            .create_index(
                IndexModel::builder()
                    .keys(doc! { "scan_run_id": 1, "vuln_type": 1 })
                    .build(),
            )
            .await?;

        // code_embeddings: compound (repo_id, graph_build_id)
        self.code_embeddings()
            .create_index(
                IndexModel::builder()
                    .keys(doc! { "repo_id": 1, "graph_build_id": 1 })
                    .build(),
            )
            .await?;

        // embedding_builds: compound (repo_id, started_at DESC)
        self.embedding_builds()
            .create_index(
                IndexModel::builder()
                    .keys(doc! { "repo_id": 1, "started_at": -1 })
                    .build(),
            )
            .await?;

        // pentest_sessions: compound (target_id, started_at DESC)
        self.pentest_sessions()
            .create_index(
                IndexModel::builder()
                    .keys(doc! { "target_id": 1, "started_at": -1 })
                    .build(),
            )
            .await?;

        // pentest_sessions: status index
        self.pentest_sessions()
            .create_index(IndexModel::builder().keys(doc! { "status": 1 }).build())
            .await?;

        // attack_chain_nodes: compound (session_id, node_id)
        self.attack_chain_nodes()
            .create_index(
                IndexModel::builder()
                    .keys(doc! { "session_id": 1, "node_id": 1 })
                    .build(),
            )
            .await?;

        // pentest_messages: compound (session_id, created_at)
        self.pentest_messages()
            .create_index(
                IndexModel::builder()
                    .keys(doc! { "session_id": 1, "created_at": 1 })
                    .build(),
            )
            .await?;

        tracing::info!("Database indexes ensured");
        Ok(())
    }

    /// `repositories` collection, typed as [`TrackedRepository`].
    pub fn repositories(&self) -> Collection<TrackedRepository> {
        self.inner.collection::<TrackedRepository>("repositories")
    }

    /// `findings` collection, typed as [`Finding`].
    pub fn findings(&self) -> Collection<Finding> {
        self.inner.collection::<Finding>("findings")
    }

    /// `scan_runs` collection, typed as [`ScanRun`].
    pub fn scan_runs(&self) -> Collection<ScanRun> {
        self.inner.collection::<ScanRun>("scan_runs")
    }

    /// `sbom_entries` collection, typed as [`SbomEntry`].
    pub fn sbom_entries(&self) -> Collection<SbomEntry> {
        self.inner.collection::<SbomEntry>("sbom_entries")
    }

    /// `cve_alerts` collection, typed as [`CveAlert`].
    pub fn cve_alerts(&self) -> Collection<CveAlert> {
        self.inner.collection::<CveAlert>("cve_alerts")
    }

    /// `cve_notifications` collection, typed as `CveNotification`.
    pub fn cve_notifications(
        &self,
    ) -> Collection<compliance_core::models::notification::CveNotification> {
        self.inner
            .collection::<compliance_core::models::notification::CveNotification>(
                "cve_notifications",
            )
    }

    /// `tracker_issues` collection, typed as [`TrackerIssue`].
    pub fn tracker_issues(&self) -> Collection<TrackerIssue> {
        self.inner.collection::<TrackerIssue>("tracker_issues")
    }

    // Graph collections

    /// `graph_nodes` collection, typed as `CodeNode`.
    pub fn graph_nodes(&self) -> Collection<compliance_core::models::graph::CodeNode> {
        self.inner
            .collection::<compliance_core::models::graph::CodeNode>("graph_nodes")
    }

    /// `graph_edges` collection, typed as `CodeEdge`.
    pub fn graph_edges(&self) -> Collection<compliance_core::models::graph::CodeEdge> {
        self.inner
            .collection::<compliance_core::models::graph::CodeEdge>("graph_edges")
    }

    /// `graph_builds` collection, typed as `GraphBuildRun`.
    pub fn graph_builds(&self) -> Collection<compliance_core::models::graph::GraphBuildRun> {
        self.inner
            .collection::<compliance_core::models::graph::GraphBuildRun>("graph_builds")
    }

    /// `impact_analyses` collection, typed as `ImpactAnalysis`.
    pub fn impact_analyses(&self) -> Collection<compliance_core::models::graph::ImpactAnalysis> {
        self.inner
            .collection::<compliance_core::models::graph::ImpactAnalysis>("impact_analyses")
    }

    // DAST collections

    /// `dast_targets` collection, typed as [`DastTarget`].
    pub fn dast_targets(&self) -> Collection<DastTarget> {
        self.inner.collection::<DastTarget>("dast_targets")
    }

    /// `dast_scan_runs` collection, typed as [`DastScanRun`].
    pub fn dast_scan_runs(&self) -> Collection<DastScanRun> {
        self.inner.collection::<DastScanRun>("dast_scan_runs")
    }

    /// `dast_findings` collection, typed as [`DastFinding`].
    pub fn dast_findings(&self) -> Collection<DastFinding> {
        self.inner.collection::<DastFinding>("dast_findings")
    }

    // Embedding collections

    /// `code_embeddings` collection, typed as `CodeEmbedding`.
    pub fn code_embeddings(&self) -> Collection<compliance_core::models::embedding::CodeEmbedding> {
        self.inner
            .collection::<compliance_core::models::embedding::CodeEmbedding>("code_embeddings")
    }

    /// `embedding_builds` collection, typed as `EmbeddingBuildRun`.
    pub fn embedding_builds(
        &self,
    ) -> Collection<compliance_core::models::embedding::EmbeddingBuildRun> {
        self.inner
            .collection::<compliance_core::models::embedding::EmbeddingBuildRun>(
                "embedding_builds",
            )
    }

    // Pentest collections

    /// `pentest_sessions` collection, typed as [`PentestSession`].
    pub fn pentest_sessions(&self) -> Collection<PentestSession> {
        self.inner.collection::<PentestSession>("pentest_sessions")
    }

    /// `attack_chain_nodes` collection, typed as [`AttackChainNode`].
    pub fn attack_chain_nodes(&self) -> Collection<AttackChainNode> {
        self.inner.collection::<AttackChainNode>("attack_chain_nodes")
    }

    /// `pentest_messages` collection, typed as [`PentestMessage`].
    pub fn pentest_messages(&self) -> Collection<PentestMessage> {
        self.inner.collection::<PentestMessage>("pentest_messages")
    }

    /// Untyped access to the collection called `name`, yielding raw BSON
    /// documents.
    // NOTE(review): presumably allowed as dead code because no in-crate
    // caller exists yet — confirm before removing the attribute.
    #[allow(dead_code)]
    pub fn raw_collection(&self, name: &str) -> Collection<mongodb::bson::Document> {
        self.inner.collection::<mongodb::bson::Document>(name)
    }

    /// Borrow the underlying MongoDB database handle (for graph persistence).
    pub fn inner(&self) -> &mongodb::Database {
        &self.inner
    }
}