compliance-scanner-agent/compliance-agent/src/database.rs

use std::sync::Arc;

use dashmap::DashMap;
use mongodb::bson::doc;
use mongodb::options::IndexOptions;
use mongodb::{Client, Collection, IndexModel};
use sha2::{Digest, Sha256};

use compliance_core::models::*;
use compliance_core::TenantContext;

use crate::error::AgentError;

/// Maximum length, in bytes, allowed for a generated database name.
///
/// Mongo enforces a 63-byte cap on database names (older clusters: 64
/// on Linux, 63 on Windows; we target the conservative limit).
const MAX_DB_NAME_LEN: usize = 63;

/// Per-tenant Mongo connection broker (M7.2 isolation model).
///
/// Holds one [`Client`] and hands out [`Database`] handles physically
/// scoped to `<db_prefix>_<tenant_id>`. The driver is the isolation
/// boundary — a handle for tenant A cannot see tenant B's documents
/// because it is connected to a different database, not because of an
/// application-level filter.
///
/// Index creation runs idempotently the first time each tenant is seen
/// in the process's lifetime. Mongo's `createIndex` is itself idempotent
/// by index name; the in-memory `ensured` set just skips the round-trip.
///
/// The type is `Clone`; clones share the same `ensured` set because it
/// lives behind an [`Arc`].
#[derive(Clone, Debug)]
pub struct DatabasePool {
    /// Driver client from which every tenant database handle is derived.
    client: Client,
    /// Prefix joined to the tenant part with `_` to form a database name.
    db_prefix: String,
    /// Tenant ids (the keys; the unit value is unused) that `for_tenant`
    /// has marked so that later calls can skip index creation.
    ensured: Arc<DashMap<String, ()>>,
}

impl DatabasePool {
    /// Build a pool on top of a single client for the cluster at `uri`.
    ///
    /// A `ping` is sent to the `admin` database before returning, so an
    /// unreachable cluster is reported here rather than on the first
    /// tenant request. `db_prefix` is stored and later used as the leading
    /// part of every tenant database name.
    ///
    /// # Errors
    ///
    /// Returns an [`AgentError`] if the client cannot be created from
    /// `uri` or if the ping command fails.
    pub async fn connect(uri: &str, db_prefix: &str) -> Result<Self, AgentError> {
        let client = Client::with_uri_str(uri).await?;

        // Connectivity probe: fail fast if the cluster does not answer.
        let admin = client.database("admin");
        admin.run_command(doc! { "ping": 1 }).await?;

        tracing::info!(
            "MongoDB cluster reachable; per-tenant pool ready (db prefix '{db_prefix}')"
        );

        let ensured = Arc::new(DashMap::new());
        let db_prefix = db_prefix.to_owned();
        Ok(Self {
            client,
            db_prefix,
            ensured,
        })
    }

    /// Return a [`Database`] scoped to this tenant, ensuring its indexes
    /// the first time the tenant is seen by this process. Cheap on the
    /// hot path — once a tenant is marked, later calls skip the
    /// round-trip.
    ///
    /// The tenant is recorded in `ensured` only *after* `ensure_indexes`
    /// has succeeded. Compared with marking first and rolling back on
    /// error, this guarantees that:
    ///
    /// - a failed attempt leaves no marker, so the next call retries;
    /// - a future dropped while awaiting index creation (for example a
    ///   cancelled request) leaves no marker either, instead of
    ///   permanently skipping index creation for that tenant;
    /// - a concurrent caller never receives a handle on the assumption
    ///   that indexes exist while the first attempt is still in flight.
    ///
    /// The trade-off is that concurrent first calls for the same tenant
    /// may each run `ensure_indexes`; that is redundant but harmless
    /// because index creation is idempotent.
    ///
    /// # Errors
    ///
    /// Propagates any [`AgentError`] raised while creating indexes.
    pub async fn for_tenant(&self, ctx: &TenantContext) -> Result<Database, AgentError> {
        let db_name = self.tenant_db_name(&ctx.tenant_id);
        let db = Database::from_database(self.client.database(&db_name));

        // `contains_key` returns a plain bool, so no map guard is held
        // across the await below.
        if !self.ensured.contains_key(&ctx.tenant_id) {
            db.ensure_indexes().await?;
            // Mark only after success; see the doc comment for why.
            self.ensured.insert(ctx.tenant_id.clone(), ());
            tracing::debug!(
                tenant_id = %ctx.tenant_id,
                db_name = %db_name,
                "Indexes ensured for tenant database"
            );
        }
        Ok(db)
    }

    /// Compute the Mongo database name for a tenant. Public for tests
    /// and tenant offboarding (`pool.client().database(name).drop()`).
    ///
    /// Format:
    ///
    /// - `<prefix>_<tenant_id>` when the id contains none of the
    ///   characters that `sanitize_tenant_id` rewrites and the result
    ///   fits in `MAX_DB_NAME_LEN` bytes;
    /// - `<prefix>_<sha256-16hex-of-tenant_id>` otherwise.
    ///
    /// Ids that would need sanitizing are hashed instead of rewritten
    /// because rewriting is not injective: `a.b`, `a/b` and `a_b` would
    /// all map to `<prefix>_a_b` and end up sharing one physical
    /// database, defeating per-tenant isolation. The hash is taken over
    /// the raw id, so distinct ids keep distinct names (64-bit
    /// truncation; collision-resistant in practice).
    ///
    /// NOTE(review): ids containing a forbidden character used to map to
    /// the rewritten name. If any such tenant database already exists it
    /// must be renamed/migrated; ids without forbidden characters (e.g.
    /// UUIDs) produce exactly the same name as before.
    pub fn tenant_db_name(&self, tenant_id: &str) -> String {
        // Embed the id verbatim only when sanitizing is a no-op, so that
        // two different ids can never produce the same natural name.
        if sanitize_tenant_id(tenant_id) == tenant_id {
            let natural = format!("{}_{}", self.db_prefix, tenant_id);
            if natural.len() <= MAX_DB_NAME_LEN {
                return natural;
            }
        }

        let mut hasher = Sha256::new();
        hasher.update(tenant_id.as_bytes());
        let digest = hasher.finalize();
        // 16 hex chars = 8 bytes = 64-bit truncation.
        let suffix = hex::encode(&digest[..8]);
        let hashed = format!("{}_{}", self.db_prefix, suffix);
        // Only an over-long `db_prefix` can violate this.
        debug_assert!(hashed.len() <= MAX_DB_NAME_LEN);
        hashed
    }

    /// Raw client handle. Reserved for cross-tenant admin flows that
    /// must opt in explicitly (tenant listing, drop-on-offboard).
    ///
    /// Returns a borrow of the pool's own client; anything done through
    /// it is not scoped to a tenant database.
    pub fn client(&self) -> &Client {
        &self.client
    }
}

/// Replace every character Mongo rejects in a database name with `_`.
///
/// The rewritten characters are `/`, `\`, `.`, `"`, `$`, space and NUL;
/// everything else is copied through unchanged. breakpilot-dev
/// tenant_ids are UUIDs, so this is belt-and-braces, but it lets the
/// pool tolerate any future tenant_id shape without surprise.
fn sanitize_tenant_id(tenant_id: &str) -> String {
    const FORBIDDEN: [char; 7] = ['/', '\\', '.', '"', '$', ' ', '\0'];

    let mut cleaned = String::with_capacity(tenant_id.len());
    for ch in tenant_id.chars() {
        let replacement = if FORBIDDEN.contains(&ch) { '_' } else { ch };
        cleaned.push(replacement);
    }
    cleaned
}

/// Wrapper around one Mongo database that exposes a typed accessor for
/// each collection used by the agent.
#[derive(Clone, Debug)]
pub struct Database {
    /// Underlying driver handle; every accessor delegates to it.
    inner: mongodb::Database,
}

impl Database {
    /// Open a dedicated client for `uri` and wrap its `db_name` database.
    ///
    /// A `ping` command is run against that database before returning so
    /// connection problems are reported by this call.
    ///
    /// # Errors
    ///
    /// Returns an [`AgentError`] if the client cannot be created from
    /// `uri` or if the ping command fails.
    pub async fn connect(uri: &str, db_name: &str) -> Result<Self, AgentError> {
        let inner = Client::with_uri_str(uri).await?.database(db_name);

        let ping = doc! { "ping": 1 };
        inner.run_command(ping).await?;

        tracing::info!("Connected to MongoDB database '{db_name}'");
        Ok(Self { inner })
    }

    /// Wrap an already-resolved Mongo database. Used by [`DatabasePool`]
    /// to hand out tenant-scoped handles without a fresh client per tenant.
    ///
    /// This only stores the handle: unlike `connect`, it sends no ping and
    /// creates no indexes.
    pub(crate) fn from_database(inner: mongodb::Database) -> Self {
        Self { inner }
    }

    pub async fn ensure_indexes(&self) -> Result<(), AgentError> {
        // repositories: unique git_url
        self.repositories()
            .create_index(
                IndexModel::builder()
                    .keys(doc! { "git_url": 1 })
                    .options(IndexOptions::builder().unique(true).build())
                    .build(),
            )
            .await?;

        // findings: unique fingerprint
        self.findings()
            .create_index(
                IndexModel::builder()
                    .keys(doc! { "fingerprint": 1 })
                    .options(IndexOptions::builder().unique(true).build())
                    .build(),
            )
            .await?;

        // findings: repo_id + severity compound
        self.findings()
            .create_index(
                IndexModel::builder()
                    .keys(doc! { "repo_id": 1, "severity": 1 })
                    .build(),
            )
            .await?;

        // scan_runs: repo_id + started_at descending
        self.scan_runs()
            .create_index(
                IndexModel::builder()
                    .keys(doc! { "repo_id": 1, "started_at": -1 })
                    .build(),
            )
            .await?;

        // sbom_entries: compound
        self.sbom_entries()
            .create_index(
                IndexModel::builder()
                    .keys(doc! { "repo_id": 1, "name": 1, "version": 1 })
                    .build(),
            )
            .await?;

        // cve_alerts: unique cve_id + repo_id
        self.cve_alerts()
            .create_index(
                IndexModel::builder()
                    .keys(doc! { "cve_id": 1, "repo_id": 1 })
                    .options(IndexOptions::builder().unique(true).build())
                    .build(),
            )
            .await?;

        // cve_notifications: unique cve_id + repo_id + package, status filter
        self.cve_notifications()
            .create_index(
                IndexModel::builder()
                    .keys(
                        doc! { "cve_id": 1, "repo_id": 1, "package_name": 1, "package_version": 1 },
                    )
                    .options(IndexOptions::builder().unique(true).build())
                    .build(),
            )
            .await?;
        self.cve_notifications()
            .create_index(
                IndexModel::builder()
                    .keys(doc! { "status": 1, "created_at": -1 })
                    .build(),
            )
            .await?;

        // tracker_issues: unique finding_id
        self.tracker_issues()
            .create_index(
                IndexModel::builder()
                    .keys(doc! { "finding_id": 1 })
                    .options(IndexOptions::builder().unique(true).build())
                    .build(),
            )
            .await?;

        // graph_nodes: compound (repo_id, graph_build_id)
        self.graph_nodes()
            .create_index(
                IndexModel::builder()
                    .keys(doc! { "repo_id": 1, "graph_build_id": 1 })
                    .build(),
            )
            .await?;

        // graph_edges: compound (repo_id, graph_build_id)
        self.graph_edges()
            .create_index(
                IndexModel::builder()
                    .keys(doc! { "repo_id": 1, "graph_build_id": 1 })
                    .build(),
            )
            .await?;

        // graph_builds: compound (repo_id, started_at DESC)
        self.graph_builds()
            .create_index(
                IndexModel::builder()
                    .keys(doc! { "repo_id": 1, "started_at": -1 })
                    .build(),
            )
            .await?;

        // impact_analyses: unique (repo_id, finding_id)
        self.impact_analyses()
            .create_index(
                IndexModel::builder()
                    .keys(doc! { "repo_id": 1, "finding_id": 1 })
                    .options(IndexOptions::builder().unique(true).build())
                    .build(),
            )
            .await?;

        // dast_targets: index on repo_id
        self.dast_targets()
            .create_index(IndexModel::builder().keys(doc! { "repo_id": 1 }).build())
            .await?;

        // dast_scan_runs: compound (target_id, started_at DESC)
        self.dast_scan_runs()
            .create_index(
                IndexModel::builder()
                    .keys(doc! { "target_id": 1, "started_at": -1 })
                    .build(),
            )
            .await?;

        // dast_findings: compound (scan_run_id, vuln_type)
        self.dast_findings()
            .create_index(
                IndexModel::builder()
                    .keys(doc! { "scan_run_id": 1, "vuln_type": 1 })
                    .build(),
            )
            .await?;

        // code_embeddings: compound (repo_id, graph_build_id)
        self.code_embeddings()
            .create_index(
                IndexModel::builder()
                    .keys(doc! { "repo_id": 1, "graph_build_id": 1 })
                    .build(),
            )
            .await?;

        // embedding_builds: compound (repo_id, started_at DESC)
        self.embedding_builds()
            .create_index(
                IndexModel::builder()
                    .keys(doc! { "repo_id": 1, "started_at": -1 })
                    .build(),
            )
            .await?;

        // pentest_sessions: compound (target_id, started_at DESC)
        self.pentest_sessions()
            .create_index(
                IndexModel::builder()
                    .keys(doc! { "target_id": 1, "started_at": -1 })
                    .build(),
            )
            .await?;

        // pentest_sessions: status index
        self.pentest_sessions()
            .create_index(IndexModel::builder().keys(doc! { "status": 1 }).build())
            .await?;

        // attack_chain_nodes: compound (session_id, node_id)
        self.attack_chain_nodes()
            .create_index(
                IndexModel::builder()
                    .keys(doc! { "session_id": 1, "node_id": 1 })
                    .build(),
            )
            .await?;

        // pentest_messages: compound (session_id, created_at)
        self.pentest_messages()
            .create_index(
                IndexModel::builder()
                    .keys(doc! { "session_id": 1, "created_at": 1 })
                    .build(),
            )
            .await?;

        tracing::info!("Database indexes ensured");
        Ok(())
    }

    /// Typed handle to the `repositories` collection
    /// ([`TrackedRepository`] documents).
    pub fn repositories(&self) -> Collection<TrackedRepository> {
        self.inner.collection("repositories")
    }

    /// Typed handle to the `findings` collection ([`Finding`] documents).
    pub fn findings(&self) -> Collection<Finding> {
        self.inner.collection("findings")
    }

    /// Typed handle to the `scan_runs` collection ([`ScanRun`] documents).
    pub fn scan_runs(&self) -> Collection<ScanRun> {
        self.inner.collection("scan_runs")
    }

    /// Typed handle to the `sbom_entries` collection ([`SbomEntry`]
    /// documents).
    pub fn sbom_entries(&self) -> Collection<SbomEntry> {
        self.inner.collection("sbom_entries")
    }

    /// Typed handle to the `cve_alerts` collection ([`CveAlert`] documents).
    pub fn cve_alerts(&self) -> Collection<CveAlert> {
        self.inner.collection("cve_alerts")
    }

    /// Typed handle to the `cve_notifications` collection
    /// (`CveNotification` documents).
    pub fn cve_notifications(
        &self,
    ) -> Collection<compliance_core::models::notification::CveNotification> {
        self.inner.collection("cve_notifications")
    }

    /// Typed handle to the `tracker_issues` collection ([`TrackerIssue`]
    /// documents).
    pub fn tracker_issues(&self) -> Collection<TrackerIssue> {
        self.inner.collection("tracker_issues")
    }

    // Graph collections

    /// Typed handle to the `graph_nodes` collection (`CodeNode` documents).
    pub fn graph_nodes(&self) -> Collection<compliance_core::models::graph::CodeNode> {
        self.inner.collection("graph_nodes")
    }

    /// Typed handle to the `graph_edges` collection (`CodeEdge` documents).
    pub fn graph_edges(&self) -> Collection<compliance_core::models::graph::CodeEdge> {
        self.inner.collection("graph_edges")
    }

    /// Typed handle to the `graph_builds` collection (`GraphBuildRun`
    /// documents).
    pub fn graph_builds(&self) -> Collection<compliance_core::models::graph::GraphBuildRun> {
        self.inner.collection("graph_builds")
    }

    /// Typed handle to the `impact_analyses` collection (`ImpactAnalysis`
    /// documents).
    pub fn impact_analyses(&self) -> Collection<compliance_core::models::graph::ImpactAnalysis> {
        self.inner.collection("impact_analyses")
    }

    // DAST collections

    /// Typed handle to the `dast_targets` collection ([`DastTarget`]
    /// documents).
    pub fn dast_targets(&self) -> Collection<DastTarget> {
        self.inner.collection("dast_targets")
    }

    /// Typed handle to the `dast_scan_runs` collection ([`DastScanRun`]
    /// documents).
    pub fn dast_scan_runs(&self) -> Collection<DastScanRun> {
        self.inner.collection("dast_scan_runs")
    }

    /// Typed handle to the `dast_findings` collection ([`DastFinding`]
    /// documents).
    pub fn dast_findings(&self) -> Collection<DastFinding> {
        self.inner.collection("dast_findings")
    }

    // Embedding collections

    /// Typed handle to the `code_embeddings` collection (`CodeEmbedding`
    /// documents).
    pub fn code_embeddings(&self) -> Collection<compliance_core::models::embedding::CodeEmbedding> {
        self.inner.collection("code_embeddings")
    }

    /// Typed handle to the `embedding_builds` collection
    /// (`EmbeddingBuildRun` documents).
    pub fn embedding_builds(
        &self,
    ) -> Collection<compliance_core::models::embedding::EmbeddingBuildRun> {
        self.inner.collection("embedding_builds")
    }

    // Pentest collections

    /// Typed handle to the `pentest_sessions` collection
    /// ([`PentestSession`] documents).
    pub fn pentest_sessions(&self) -> Collection<PentestSession> {
        self.inner.collection("pentest_sessions")
    }

    /// Typed handle to the `attack_chain_nodes` collection
    /// ([`AttackChainNode`] documents).
    pub fn attack_chain_nodes(&self) -> Collection<AttackChainNode> {
        self.inner.collection("attack_chain_nodes")
    }

    /// Typed handle to the `pentest_messages` collection
    /// ([`PentestMessage`] documents).
    pub fn pentest_messages(&self) -> Collection<PentestMessage> {
        self.inner.collection("pentest_messages")
    }

    /// Untyped handle to the collection called `name`, exposing its
    /// documents as raw BSON [`mongodb::bson::Document`]s.
    // NOTE(review): `dead_code` is allowed, so this presumably has no
    // caller in this crate right now — confirm it is still wanted.
    #[allow(dead_code)]
    pub fn raw_collection(&self, name: &str) -> Collection<mongodb::bson::Document> {
        self.inner.collection(name)
    }

    /// Get the raw MongoDB database handle (for graph persistence).
    ///
    /// Returns a borrow of the wrapped driver handle, bypassing the typed
    /// collection accessors.
    pub fn inner(&self) -> &mongodb::Database {
        &self.inner
    }
}