fixup(m7.2-A): validate db_prefix at connect, bump hash to 16 bytes

Addresses review feedback on the hash-fallback path. The original `debug_assert!(hashed.len() <= MAX_DB_NAME_LEN)` was a runtime hack that vanished in release builds. With an 8-byte hash truncation (~2^32 birthday-collision resistance), two tenant_ids hashing to the same suffix would silently share a database — no panic, no rollback, just cross-tenant data leak. Not acceptable for a regulated-industry product. Changes: - Bump hash truncation 8 → 16 bytes (32 hex chars). 2^64 birthday resistance — collision-impossible at our scale. - Add MAX_PREFIX_LEN (= 30) and validate db_prefix.len() at `DatabasePool::connect`. The runtime hash-fallback arithmetic is now provably within Mongo's 63-byte cap; drop the debug_assert!. - New test `connect_rejects_overlong_db_prefix` exercises the inclusive bound (30 passes, 31 fails). - Existing hash-fallback test now asserts a 32-char hex suffix + basic distinctness for two different inputs. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
feat(m7.2-A): introduce per-tenant DatabasePool
2026-06-17 13:16:46 +02:00 · 2026-06-17 11:58:24 +02:00 · 2026-06-17 09:36:52 +00:00 · 2026-06-04 14:46:14 +00:00 · 2026-06-04 14:38:35 +00:00
12 changed files with 632 additions and 136 deletions
@@ -687,6 +687,7 @@ dependencies = [
 "tokio-cron-scheduler",
 "tokio-stream",
 "tokio-tungstenite 0.26.2",
+ "tower",
 "tower-http",
 "tracing",
 "tracing-subscriber",
@@ -7,7 +7,7 @@ edition = "2021"
 workspace = true

 [dependencies]
-compliance-core = { workspace = true, features = ["mongodb", "telemetry"] }
+compliance-core = { workspace = true, features = ["mongodb", "telemetry", "axum"] }
 compliance-graph = { path = "../compliance-graph" }
 compliance-dast = { path = "../compliance-dast" }
 serde = { workspace = true }
@@ -44,7 +44,8 @@ dashmap = { workspace = true }
 tokio-stream = { workspace = true }

 [dev-dependencies]
-compliance-core = { workspace = true, features = ["mongodb"] }
+compliance-core = { workspace = true, features = ["mongodb", "axum"] }
+tower = { version = "0.5", features = ["util"] }
 reqwest = { workspace = true }
 serde_json = { workspace = true }
 tokio = { workspace = true }
@@ -6,7 +6,7 @@ use tokio::sync::{broadcast, watch, Semaphore};
 use compliance_core::models::pentest::PentestEvent;
 use compliance_core::AgentConfig;

-use crate::database::Database;
+use crate::database::{Database, DatabasePool};
 use crate::llm::LlmClient;
 use crate::pipeline::orchestrator::PipelineOrchestrator;

@@ -16,7 +16,13 @@ const DEFAULT_MAX_CONCURRENT_SESSIONS: usize = 5;
 #[derive(Clone)]
 pub struct ComplianceAgent {
    pub config: AgentConfig,
+    /// Transitional single-database handle. Used by handlers that have
+    /// not yet been migrated to `db_pool.for_tenant(&ctx)` (M7.2-B/C).
+    /// Will be removed once every call site is tenant-scoped (M7.2-D).
    pub db: Database,
+    /// Per-tenant Mongo broker introduced in M7.2-A. Handlers should
+    /// prefer this and obtain a tenant-scoped [`Database`] from it.
+    pub db_pool: DatabasePool,
    pub llm: Arc<LlmClient>,
    pub http: reqwest::Client,
    /// Per-session broadcast senders for SSE streaming.
@@ -28,7 +34,7 @@ pub struct ComplianceAgent {
 }

 impl ComplianceAgent {
-    pub fn new(config: AgentConfig, db: Database) -> Self {
+    pub fn new(config: AgentConfig, db: Database, db_pool: DatabasePool) -> Self {
        let llm = Arc::new(LlmClient::new(
            config.litellm_url.clone(),
            config.litellm_api_key.clone(),
@@ -43,6 +49,7 @@ impl ComplianceAgent {
        Self {
            config,
            db,
+            db_pool,
            llm,
            http,
            session_streams: Arc::new(DashMap::new()),
@@ -1,113 +0,0 @@
-use std::sync::Arc;
-
-use axum::{
-    extract::Request,
-    middleware::Next,
-    response::{IntoResponse, Response},
-};
-use jsonwebtoken::{decode, decode_header, jwk::JwkSet, DecodingKey, Validation};
-use reqwest::StatusCode;
-use serde::Deserialize;
-use tokio::sync::RwLock;
-
-/// Cached JWKS from Keycloak for token validation.
-#[derive(Clone)]
-pub struct JwksState {
-    pub jwks: Arc<RwLock<Option<JwkSet>>>,
-    pub jwks_url: String,
-}
-
-#[derive(Debug, Deserialize)]
-struct Claims {
-    #[allow(dead_code)]
-    sub: String,
-}
-
-const PUBLIC_ENDPOINTS: &[&str] = &["/api/v1/health"];
-
-/// Middleware that validates Bearer JWT tokens against Keycloak's JWKS.
-///
-/// Skips validation for health check endpoints.
-/// If `JwksState` is not present as an extension (keycloak not configured),
-/// all requests pass through.
-pub async fn require_jwt_auth(request: Request, next: Next) -> Response {
-    let path = request.uri().path();
-
-    if PUBLIC_ENDPOINTS.contains(&path) {
-        return next.run(request).await;
-    }
-
-    let jwks_state = match request.extensions().get::<JwksState>() {
-        Some(s) => s.clone(),
-        None => return next.run(request).await,
-    };
-
-    let auth_header = match request.headers().get("authorization") {
-        Some(h) => h,
-        None => return (StatusCode::UNAUTHORIZED, "Missing authorization header").into_response(),
-    };
-
-    let token = match auth_header.to_str() {
-        Ok(s) if s.starts_with("Bearer ") => &s[7..],
-        _ => return (StatusCode::UNAUTHORIZED, "Invalid authorization header").into_response(),
-    };
-
-    match validate_token(token, &jwks_state).await {
-        Ok(()) => next.run(request).await,
-        Err(e) => {
-            tracing::warn!("JWT validation failed: {e}");
-            (StatusCode::UNAUTHORIZED, "Invalid token").into_response()
-        }
-    }
-}
-
-async fn validate_token(token: &str, state: &JwksState) -> Result<(), String> {
-    let header = decode_header(token).map_err(|e| format!("failed to decode JWT header: {e}"))?;
-
-    let kid = header
-        .kid
-        .ok_or_else(|| "JWT missing kid header".to_string())?;
-
-    let jwks = fetch_or_get_jwks(state).await?;
-
-    let jwk = jwks
-        .keys
-        .iter()
-        .find(|k| k.common.key_id.as_deref() == Some(&kid))
-        .ok_or_else(|| "no matching key found in JWKS".to_string())?;
-
-    let decoding_key =
-        DecodingKey::from_jwk(jwk).map_err(|e| format!("failed to create decoding key: {e}"))?;
-
-    let mut validation = Validation::new(header.alg);
-    validation.validate_exp = true;
-    validation.validate_aud = false;
-
-    decode::<Claims>(token, &decoding_key, &validation)
-        .map_err(|e| format!("token validation failed: {e}"))?;
-
-    Ok(())
-}
-
-async fn fetch_or_get_jwks(state: &JwksState) -> Result<JwkSet, String> {
-    {
-        let cached = state.jwks.read().await;
-        if let Some(ref jwks) = *cached {
-            return Ok(jwks.clone());
-        }
-    }
-
-    let resp = reqwest::get(&state.jwks_url)
-        .await
-        .map_err(|e| format!("failed to fetch JWKS: {e}"))?;
-
-    let jwks: JwkSet = resp
-        .json()
-        .await
-        .map_err(|e| format!("failed to parse JWKS: {e}"))?;
-
-    let mut cached = state.jwks.write().await;
-    *cached = Some(jwks.clone());
-
-    Ok(jwks)
-}
@@ -1,4 +1,3 @@
-pub mod auth_middleware;
 pub mod handlers;
 pub mod routes;
 pub mod server;
@@ -7,8 +7,9 @@ use tower_http::cors::CorsLayer;
 use tower_http::set_header::SetResponseHeaderLayer;
 use tower_http::trace::TraceLayer;

+use compliance_core::auth::{require_jwt_auth, require_tenant_status, JwksState};
+
 use crate::agent::ComplianceAgent;
-use crate::api::auth_middleware::{require_jwt_auth, JwksState};
 use crate::api::routes;
 use crate::error::AgentError;

@@ -44,9 +45,13 @@ pub async fn start_api_server(agent: ComplianceAgent, port: u16) -> Result<(), A
            jwks_url,
        };
        tracing::info!("Keycloak JWT auth enabled for realm '{kc_realm}'");
+        // Layers execute outermost-first. Extension(jwks_state) must run
+        // before require_jwt_auth so the middleware can read it; the
+        // status gate runs after JWT so TenantContext is in extensions.
        app = app
-            .layer(Extension(jwks_state))
-            .layer(middleware::from_fn(require_jwt_auth));
+            .layer(middleware::from_fn(require_tenant_status))
+            .layer(middleware::from_fn(require_jwt_auth))
+            .layer(Extension(jwks_state));
    } else {
        tracing::warn!("Keycloak not configured - API endpoints are unprotected");
    }
@@ -1,11 +1,151 @@
+use std::sync::Arc;
+
+use dashmap::DashMap;
 use mongodb::bson::doc;
 use mongodb::options::IndexOptions;
 use mongodb::{Client, Collection, IndexModel};
+use sha2::{Digest, Sha256};

 use compliance_core::models::*;
+use compliance_core::TenantContext;

 use crate::error::AgentError;

+/// Mongo enforces a 63-byte cap on database names (older clusters: 64
+/// on Linux, 63 on Windows; we target the conservative limit).
+const MAX_DB_NAME_LEN: usize = 63;
+
+/// Hex length of the SHA-256 truncation used for the hash fallback
+/// tenant DB name (16 bytes → 32 hex chars). 16 bytes gives ~2^64
+/// birthday-collision resistance — at our 10s-100s tenant scale this
+/// is effectively impossible to hit.
+const HASH_HEX_LEN: usize = 32;
+
+/// Largest `db_prefix` that still guarantees the hash-fallback name
+/// fits in the 63-byte cap: `prefix + "_" + 32 hex chars`.
+const MAX_PREFIX_LEN: usize = MAX_DB_NAME_LEN - 1 - HASH_HEX_LEN;
+
+/// Per-tenant Mongo connection broker (M7.2 isolation model).
+///
+/// Holds one [`Client`] and hands out [`Database`] handles physically
+/// scoped to `<db_prefix>_<tenant_id>`. The driver is the isolation
+/// boundary — a handle for tenant A cannot see tenant B's documents
+/// because it is connected to a different database, not because of an
+/// application-level filter.
+///
+/// Index creation runs idempotently the first time each tenant is seen
+/// in the process's lifetime. Mongo's `createIndex` is itself idempotent
+/// by index name; the in-memory `ensured` set just skips the round-trip.
+#[derive(Clone, Debug)]
+pub struct DatabasePool {
+    client: Client,
+    db_prefix: String,
+    ensured: Arc<DashMap<String, ()>>,
+}
+
+impl DatabasePool {
+    /// Connect to the cluster and prepare to hand out tenant databases
+    /// named `<db_prefix>_<tenant_id>`.
+    ///
+    /// Validates `db_prefix.len() <= MAX_PREFIX_LEN` so the
+    /// hash-fallback path is provably within Mongo's 63-byte db-name
+    /// cap. Refuses to construct a pool that could ever produce an
+    /// over-long name.
+    pub async fn connect(uri: &str, db_prefix: &str) -> Result<Self, AgentError> {
+        if db_prefix.len() > MAX_PREFIX_LEN {
+            return Err(AgentError::Other(format!(
+                "db_prefix '{db_prefix}' is {} chars; max is {MAX_PREFIX_LEN} so the \
+                 hash-fallback tenant DB name fits Mongo's {MAX_DB_NAME_LEN}-byte cap",
+                db_prefix.len()
+            )));
+        }
+        let client = Client::with_uri_str(uri).await?;
+        client
+            .database("admin")
+            .run_command(doc! { "ping": 1 })
+            .await?;
+        tracing::info!(
+            "MongoDB cluster reachable; per-tenant pool ready (db prefix '{db_prefix}')"
+        );
+        Ok(Self {
+            client,
+            db_prefix: db_prefix.to_string(),
+            ensured: Arc::new(DashMap::new()),
+        })
+    }
+
+    /// Return a [`Database`] scoped to this tenant. Ensures indexes on
+    /// first call per tenant (per process). Cheap on the hot path —
+    /// subsequent calls skip the round-trip.
+    pub async fn for_tenant(&self, ctx: &TenantContext) -> Result<Database, AgentError> {
+        let db_name = self.tenant_db_name(&ctx.tenant_id);
+        let db = Database::from_database(self.client.database(&db_name));
+        // `DashMap::insert` returns the previous value; `None` means we
+        // were the first writer for this tenant_id and own the
+        // index-ensure work.
+        if self.ensured.insert(ctx.tenant_id.clone(), ()).is_none() {
+            if let Err(e) = db.ensure_indexes().await {
+                // Roll the marker back so the next request retries.
+                self.ensured.remove(&ctx.tenant_id);
+                return Err(e);
+            }
+            tracing::debug!(
+                tenant_id = %ctx.tenant_id,
+                db_name = %db_name,
+                "Indexes ensured for tenant database"
+            );
+        }
+        Ok(db)
+    }
+
+    /// Compute the Mongo database name for a tenant. Public for tests
+    /// and tenant offboarding (`pool.client().database(name).drop()`).
+    ///
+    /// Format: `<prefix>_<sanitized_tenant_id>` if it fits the 63-byte
+    /// cap, else `<prefix>_<sha256-16-byte-hex-of-tenant_id>`. The
+    /// `db_prefix` length invariant established at [`Self::connect`]
+    /// guarantees the hash-fallback name always fits — no runtime
+    /// assertion needed.
+    ///
+    /// Collision resistance: the hash fallback is a 16-byte SHA-256
+    /// truncation, which gives ~2^64 birthday-collision resistance. At
+    /// our 10s–100s tenant scale the probability of two tenant_ids
+    /// colliding is effectively zero. (8-byte truncation would have
+    /// been ~2^32 — too close for comfort on a regulated product.)
+    pub fn tenant_db_name(&self, tenant_id: &str) -> String {
+        let sanitized = sanitize_tenant_id(tenant_id);
+        let natural = format!("{}_{}", self.db_prefix, sanitized);
+        if natural.len() <= MAX_DB_NAME_LEN {
+            natural
+        } else {
+            let mut hasher = Sha256::new();
+            hasher.update(tenant_id.as_bytes());
+            let digest = hasher.finalize();
+            let suffix = hex::encode(&digest[..HASH_HEX_LEN / 2]);
+            format!("{}_{}", self.db_prefix, suffix)
+        }
+    }
+
+    /// Raw client handle. Reserved for cross-tenant admin flows that
+    /// must opt in explicitly (tenant listing, drop-on-offboard).
+    pub fn client(&self) -> &Client {
+        &self.client
+    }
+}
+
+/// Mongo database names disallow `/`, `\`, `.`, `"`, `$`, ` `, and NUL.
+/// breakpilot-dev tenant_ids are UUIDs so this is belt-and-braces, but
+/// it lets the pool tolerate any future tenant_id shape without surprise.
+fn sanitize_tenant_id(tenant_id: &str) -> String {
+    tenant_id
+        .chars()
+        .map(|c| match c {
+            '/' | '\\' | '.' | '"' | '$' | ' ' | '\0' => '_',
+            c => c,
+        })
+        .collect()
+}
+
 #[derive(Clone, Debug)]
 pub struct Database {
    inner: mongodb::Database,
@@ -20,6 +160,12 @@ impl Database {
        Ok(Self { inner: db })
    }

+    /// Wrap an already-resolved Mongo database. Used by [`DatabasePool`]
+    /// to hand out tenant-scoped handles without a fresh client per tenant.
+    pub(crate) fn from_database(inner: mongodb::Database) -> Self {
+        Self { inner }
+    }
+
    pub async fn ensure_indexes(&self) -> Result<(), AgentError> {
        // repositories: unique git_url
        self.repositories()
@@ -28,7 +28,13 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
    let db = database::Database::connect(&config.mongodb_uri, &config.mongodb_database).await?;
    db.ensure_indexes().await?;

-    let agent = agent::ComplianceAgent::new(config.clone(), db.clone());
+    // M7.2-A: per-tenant pool. Uses `mongodb_database` as the db-name
+    // prefix so tenant databases land as `<prefix>_<tenant_id>` next to
+    // the legacy single-tenant database.
+    let db_pool =
+        database::DatabasePool::connect(&config.mongodb_uri, &config.mongodb_database).await?;
+
+    let agent = agent::ComplianceAgent::new(config.clone(), db.clone(), db_pool);

    tracing::info!("Starting scheduler...");
    let scheduler_agent = agent.clone();
@@ -7,7 +7,7 @@ use std::sync::Arc;

 use compliance_agent::agent::ComplianceAgent;
 use compliance_agent::api;
-use compliance_agent::database::Database;
+use compliance_agent::database::{Database, DatabasePool};
 use compliance_core::AgentConfig;
 use secrecy::SecretString;

@@ -33,6 +33,10 @@ impl TestServer {
            .expect("Failed to connect to MongoDB — is it running?");
        db.ensure_indexes().await.expect("Failed to create indexes");

+        let db_pool = DatabasePool::connect(&mongodb_uri, &db_name)
+            .await
+            .expect("Failed to build DatabasePool");
+
        let config = AgentConfig {
            mongodb_uri: mongodb_uri.clone(),
            mongodb_database: db_name.clone(),
@@ -69,7 +73,7 @@ impl TestServer {
            pentest_imap_password: None,
        };

-        let agent = ComplianceAgent::new(config, db);
+        let agent = ComplianceAgent::new(config, db, db_pool);

        // Build the router with the agent extension
        let app = api::routes::build_router()
@@ -0,0 +1,234 @@
+//! M7.2-A — `DatabasePool` isolation proof.
+//!
+//! Two `TenantContext`s, two databases, one client. Insert on A, query
+//! on B → empty. Insert on B, query on A → only A's docs. Proves that
+//! the per-tenant database split actually isolates at the driver level
+//! and not at "we hope we filter."
+//!
+//! Requires MongoDB. Set `TEST_MONGODB_URI` to override the default
+//! `mongodb://root:example@localhost:27017/?authSource=admin`.
+
+#![allow(clippy::expect_used, clippy::unwrap_used)]
+
+use compliance_agent::database::DatabasePool;
+use compliance_core::models::TrackedRepository;
+use compliance_core::{OrgRole, TenantContext, TenantStatus};
+use mongodb::bson::doc;
+
+fn ctx(tenant_id: &str, slug: &str) -> TenantContext {
+    TenantContext {
+        tenant_id: tenant_id.to_string(),
+        tenant_slug: slug.to_string(),
+        org_roles: vec![OrgRole::ItAdmin],
+        products: vec!["compliance-scanner".to_string()],
+        plan: "starter".to_string(),
+        status: TenantStatus::Active,
+        user_id: "u-1".to_string(),
+        user_name: None,
+    }
+}
+
+fn fixture_repo(name: &str, git_url: &str) -> TrackedRepository {
+    TrackedRepository {
+        id: None,
+        name: name.to_string(),
+        git_url: git_url.to_string(),
+        default_branch: "main".to_string(),
+        local_path: None,
+        scan_schedule: None,
+        webhook_enabled: false,
+        webhook_secret: None,
+        tracker_type: None,
+        tracker_owner: None,
+        tracker_repo: None,
+        tracker_token: None,
+        auth_token: None,
+        auth_username: None,
+        last_scanned_commit: None,
+        findings_count: 0,
+        created_at: chrono::Utc::now(),
+        updated_at: chrono::Utc::now(),
+    }
+}
+
+#[tokio::test]
+async fn pool_isolates_tenants_at_driver_level() {
+    let uri = std::env::var("TEST_MONGODB_URI")
+        .unwrap_or_else(|_| "mongodb://root:example@localhost:27017/?authSource=admin".into());
+    // Unique per run so parallel test invocations don't collide. Kept
+    // short because Mongo caps db names at 63 bytes (prefix + tenant_id).
+    let prefix = format!("m72a_{}", short_id());
+
+    let pool = DatabasePool::connect(&uri, &prefix)
+        .await
+        .expect("Failed to connect to MongoDB — is it running?");
+
+    let acme = ctx("00000000-0000-0000-0000-00000000acme", "acme");
+    let globex = ctx("00000000-0000-0000-0000-0000globex000", "globex");
+
+    let acme_db = pool.for_tenant(&acme).await.expect("acme db");
+    let globex_db = pool.for_tenant(&globex).await.expect("globex db");
+
+    // Write distinct repos into each tenant's database.
+    acme_db
+        .repositories()
+        .insert_one(fixture_repo("acme-app", "git@example.com:acme/app.git"))
+        .await
+        .expect("insert acme");
+    globex_db
+        .repositories()
+        .insert_one(fixture_repo(
+            "globex-platform",
+            "git@example.com:globex/platform.git",
+        ))
+        .await
+        .expect("insert globex");
+
+    // The point of the whole exercise: acme can ONLY see acme's repo
+    // and globex can ONLY see globex's, with no filter doc anywhere
+    // because the isolation is at the database handle, not in the query.
+    let acme_seen = collect(&acme_db).await;
+    let globex_seen = collect(&globex_db).await;
+
+    assert_eq!(acme_seen.len(), 1, "acme should see exactly its own repo");
+    assert_eq!(acme_seen[0].name, "acme-app");
+    assert_eq!(
+        globex_seen.len(),
+        1,
+        "globex should see exactly its own repo"
+    );
+    assert_eq!(globex_seen[0].name, "globex-platform");
+
+    // Sanity: the two databases really are different by name.
+    let acme_db_name = pool.tenant_db_name(&acme.tenant_id);
+    let globex_db_name = pool.tenant_db_name(&globex.tenant_id);
+    assert_ne!(acme_db_name, globex_db_name);
+    assert!(acme_db_name.starts_with(&prefix));
+
+    // Cleanup — drop both per-tenant databases.
+    pool.client()
+        .database(&acme_db_name)
+        .drop()
+        .await
+        .expect("drop acme");
+    pool.client()
+        .database(&globex_db_name)
+        .drop()
+        .await
+        .expect("drop globex");
+}
+
+#[tokio::test]
+async fn for_tenant_is_idempotent_index_creation() {
+    let uri = std::env::var("TEST_MONGODB_URI")
+        .unwrap_or_else(|_| "mongodb://root:example@localhost:27017/?authSource=admin".into());
+    let prefix = format!("m72a_{}", short_id());
+    let pool = DatabasePool::connect(&uri, &prefix).await.expect("connect");
+
+    let acme = ctx("00000000-0000-0000-0000-00000000acme", "acme");
+
+    // Second call must not fail (ensure_indexes already ran, in-memory
+    // marker is set, Mongo's createIndex is idempotent by name anyway).
+    let _ = pool.for_tenant(&acme).await.expect("first call");
+    let _ = pool.for_tenant(&acme).await.expect("second call");
+    let _ = pool.for_tenant(&acme).await.expect("third call");
+
+    // Cleanup
+    let db_name = pool.tenant_db_name(&acme.tenant_id);
+    pool.client().database(&db_name).drop().await.expect("drop");
+}
+
+#[tokio::test]
+async fn tenant_db_name_sanitizes_unsafe_characters() {
+    let uri = std::env::var("TEST_MONGODB_URI")
+        .unwrap_or_else(|_| "mongodb://root:example@localhost:27017/?authSource=admin".into());
+    let pool = DatabasePool::connect(&uri, "m72a_sanitize")
+        .await
+        .expect("connect");
+
+    // Mongo db names cannot contain `/ \ . " $ <space> NUL`. The pool
+    // must rewrite these without exploding on connect.
+    let funky = "te/n.a\\nt$id\" with spaces";
+    let name = pool.tenant_db_name(funky);
+    for c in ['/', '\\', '.', '"', '$', ' '] {
+        assert!(
+            !name.contains(c),
+            "sanitized db name still contains {c:?}: {name}"
+        );
+    }
+}
+
+#[tokio::test]
+async fn tenant_db_name_falls_back_to_hash_when_too_long() {
+    let uri = std::env::var("TEST_MONGODB_URI")
+        .unwrap_or_else(|_| "mongodb://root:example@localhost:27017/?authSource=admin".into());
+    let pool = DatabasePool::connect(&uri, "m72a_long")
+        .await
+        .expect("connect");
+
+    // 100-byte tenant_id would overflow the 63-byte db-name cap with
+    // any reasonable prefix. The pool must hash it down.
+    let huge = "x".repeat(100);
+    let name = pool.tenant_db_name(&huge);
+    assert!(name.len() <= 63, "hashed name should fit: {name}");
+    assert!(name.starts_with("m72a_long_"));
+    // The hash suffix is 32 hex chars (16-byte SHA-256 truncation).
+    let suffix = name.trim_start_matches("m72a_long_");
+    assert_eq!(
+        suffix.len(),
+        32,
+        "expected 32-hex suffix (16-byte hash), got {suffix:?}"
+    );
+    assert!(suffix.chars().all(|c| c.is_ascii_hexdigit()));
+
+    // Stable: same input → same output.
+    assert_eq!(name, pool.tenant_db_name(&huge));
+
+    // Different inputs → different outputs (collision check on a tiny
+    // sample — full birthday-resistance is a proof not a test).
+    let huge2 = "y".repeat(100);
+    assert_ne!(pool.tenant_db_name(&huge), pool.tenant_db_name(&huge2));
+}
+
+#[tokio::test]
+async fn connect_rejects_overlong_db_prefix() {
+    let uri = std::env::var("TEST_MONGODB_URI")
+        .unwrap_or_else(|_| "mongodb://root:example@localhost:27017/?authSource=admin".into());
+
+    // MAX_PREFIX_LEN is 30 (= 63 - 1 - 32). A 31-char prefix MUST be
+    // rejected at construction so the hash-fallback path can never
+    // produce an over-long db name at runtime.
+    let too_long = "a".repeat(31);
+    let err = DatabasePool::connect(&uri, &too_long).await.unwrap_err();
+    let msg = format!("{err}");
+    assert!(
+        msg.contains("max is 30") || msg.contains(&too_long),
+        "error should explain the cap: {msg}"
+    );
+
+    // Exactly 30 chars is the inclusive bound — must succeed.
+    let just_right = "a".repeat(30);
+    let _ = DatabasePool::connect(&uri, &just_right)
+        .await
+        .expect("30-char prefix should be accepted");
+}
+
+/// Short UUID slug for keeping test prefixes well under Mongo's 63-byte
+/// db-name cap.
+fn short_id() -> String {
+    uuid::Uuid::new_v4().simple().to_string()[..8].to_string()
+}
+
+/// Drain a `repositories` find cursor on the given tenant database.
+async fn collect(db: &compliance_agent::database::Database) -> Vec<TrackedRepository> {
+    let mut cursor = db
+        .repositories()
+        .find(doc! {})
+        .await
+        .expect("find repositories");
+    let mut out = Vec::new();
+    while cursor.advance().await.expect("advance") {
+        out.push(cursor.deserialize_current().expect("deserialize"));
+    }
+    out
+}
@@ -0,0 +1,122 @@
+//! M7.1 — integration tests for `compliance_core::auth::require_tenant_status`.
+//!
+//! Exercises the middleware end-to-end through an Axum router so we
+//! catch wiring bugs (extension propagation, method matching) that pure
+//! unit tests would miss.
+
+#![allow(clippy::expect_used, clippy::unwrap_used)]
+
+use axum::{
+    body::Body,
+    extract::Request,
+    http::{Method, StatusCode},
+    middleware::{from_fn, Next},
+    response::Response,
+    routing::{get, post},
+    Router,
+};
+use compliance_core::{auth::require_tenant_status, TenantContext, TenantStatus};
+use tower::ServiceExt;
+
+fn ctx_with(status: TenantStatus) -> TenantContext {
+    TenantContext {
+        tenant_id: "t-1".to_string(),
+        tenant_slug: "acme".to_string(),
+        org_roles: vec![],
+        products: vec![],
+        plan: "starter".to_string(),
+        status,
+        user_id: "u-1".to_string(),
+        user_name: None,
+    }
+}
+
+fn router_with_ctx(ctx: Option<TenantContext>) -> Router {
+    let injector = move |mut req: Request, next: Next| {
+        let ctx = ctx.clone();
+        async move {
+            if let Some(c) = ctx {
+                req.extensions_mut().insert(c);
+            }
+            next.run(req).await
+        }
+    };
+
+    Router::new()
+        .route("/r", get(|| async { "read" }))
+        .route("/w", post(|| async { "write" }))
+        .layer(from_fn(require_tenant_status))
+        .layer(from_fn(injector))
+}
+
+async fn call(router: Router, method: Method, path: &str) -> Response {
+    let req = Request::builder()
+        .method(method)
+        .uri(path)
+        .body(Body::empty())
+        .expect("request build");
+    router.oneshot(req).await.expect("oneshot")
+}
+
+#[tokio::test]
+async fn active_tenant_can_read_and_write() {
+    let r = router_with_ctx(Some(ctx_with(TenantStatus::Active)));
+    assert_eq!(
+        call(r.clone(), Method::GET, "/r").await.status(),
+        StatusCode::OK
+    );
+    assert_eq!(call(r, Method::POST, "/w").await.status(), StatusCode::OK);
+}
+
+#[tokio::test]
+async fn trial_tenant_can_read_and_write() {
+    let r = router_with_ctx(Some(ctx_with(TenantStatus::Trial)));
+    assert_eq!(
+        call(r.clone(), Method::GET, "/r").await.status(),
+        StatusCode::OK
+    );
+    assert_eq!(call(r, Method::POST, "/w").await.status(), StatusCode::OK);
+}
+
+#[tokio::test]
+async fn demo_tenant_can_read_and_write() {
+    let r = router_with_ctx(Some(ctx_with(TenantStatus::Demo)));
+    assert_eq!(
+        call(r.clone(), Method::GET, "/r").await.status(),
+        StatusCode::OK
+    );
+    assert_eq!(call(r, Method::POST, "/w").await.status(), StatusCode::OK);
+}
+
+#[tokio::test]
+async fn frozen_tenant_can_read_but_not_write() {
+    let r = router_with_ctx(Some(ctx_with(TenantStatus::Frozen)));
+    assert_eq!(
+        call(r.clone(), Method::GET, "/r").await.status(),
+        StatusCode::OK
+    );
+    assert_eq!(
+        call(r, Method::POST, "/w").await.status(),
+        StatusCode::PAYMENT_REQUIRED
+    );
+}
+
+#[tokio::test]
+async fn archived_tenant_is_gone_on_every_method() {
+    let r = router_with_ctx(Some(ctx_with(TenantStatus::Archived)));
+    assert_eq!(
+        call(r.clone(), Method::GET, "/r").await.status(),
+        StatusCode::GONE
+    );
+    assert_eq!(call(r, Method::POST, "/w").await.status(), StatusCode::GONE);
+}
+
+#[tokio::test]
+async fn no_context_passes_through() {
+    let r = router_with_ctx(None);
+    assert_eq!(
+        call(r.clone(), Method::GET, "/r").await.status(),
+        StatusCode::OK
+    );
+    assert_eq!(call(r, Method::POST, "/w").await.status(), StatusCode::OK);
+}
@@ -148,27 +148,83 @@ async fn validate_token(token: &str, state: &JwksState) -> Result<TenantContext,

    let kid = header
        .kid
+        .clone()
        .ok_or_else(|| "JWT missing kid header".to_string())?;

-    let jwks = fetch_or_get_jwks(state).await?;
+    // First try against whatever's currently cached. If the kid isn't
+    // there or the signature doesn't verify, the cached JWKS is most
+    // likely stale (KC rotated keys) — refresh once and retry before
+    // giving up. Without this every key rotation produces a silent 401
+    // storm that only goes away when the agent restarts.
+    let jwks = fetch_or_get_jwks(state, false).await?;
+    match try_validate(token, &header, &kid, &jwks) {
+        Ok(ctx) => Ok(ctx),
+        Err(ValidationError::Permanent(e)) => Err(e),
+        Err(ValidationError::Stale(reason)) => {
+            tracing::info!(
+                kid = %kid,
+                reason = %reason,
+                "JWKS appears stale — forcing refresh and retrying"
+            );
+            let jwks = fetch_or_get_jwks(state, true).await?;
+            try_validate(token, &header, &kid, &jwks).map_err(|e| match e {
+                ValidationError::Stale(s) | ValidationError::Permanent(s) => s,
+            })
+        }
+    }
+}

-    let jwk = jwks
+#[derive(Debug)]
+enum ValidationError {
+    /// Refresh-eligible: cached JWKS may be stale.
+    Stale(String),
+    /// Refusing the token regardless of JWKS freshness.
+    Permanent(String),
+}
+
+fn try_validate(
+    token: &str,
+    header: &jsonwebtoken::Header,
+    kid: &str,
+    jwks: &JwkSet,
+) -> Result<TenantContext, ValidationError> {
+    let jwk = match jwks
        .keys
        .iter()
-        .find(|k| k.common.key_id.as_deref() == Some(&kid))
-        .ok_or_else(|| "no matching key found in JWKS".to_string())?;
+        .find(|k| k.common.key_id.as_deref() == Some(kid))
+    {
+        Some(j) => j,
+        None => {
+            return Err(ValidationError::Stale(
+                "no matching key found in JWKS".to_string(),
+            ))
+        }
+    };

-    let decoding_key =
-        DecodingKey::from_jwk(jwk).map_err(|e| format!("failed to create decoding key: {e}"))?;
+    let decoding_key = DecodingKey::from_jwk(jwk)
+        .map_err(|e| ValidationError::Permanent(format!("failed to create decoding key: {e}")))?;

    let mut validation = Validation::new(header.alg);
    validation.validate_exp = true;
    validation.validate_aud = false;

-    let data = decode::<Claims>(token, &decoding_key, &validation)
-        .map_err(|e| format!("token validation failed: {e}"))?;
+    let data = match decode::<Claims>(token, &decoding_key, &validation) {
+        Ok(d) => d,
+        Err(e) => {
+            // Signature mismatch is the other refresh-eligible failure:
+            // the matching kid is present but the key bytes don't match.
+            // Everything else (expired, malformed, etc.) is permanent.
+            return Err(
+                if matches!(e.kind(), jsonwebtoken::errors::ErrorKind::InvalidSignature) {
+                    ValidationError::Stale(format!("token validation failed: {e}"))
+                } else {
+                    ValidationError::Permanent(format!("token validation failed: {e}"))
+                },
+            );
+        }
+    };

-    claims_to_context(data.claims)
+    claims_to_context(data.claims).map_err(ValidationError::Permanent)
 }

 /// Map the decoded JWT payload into the platform-wide `TenantContext`.
@@ -198,14 +254,25 @@ fn claims_to_context(c: Claims) -> Result<TenantContext, String> {
    })
 }

-async fn fetch_or_get_jwks(state: &JwksState) -> Result<JwkSet, String> {
-    {
+async fn fetch_or_get_jwks(state: &JwksState, force: bool) -> Result<JwkSet, String> {
+    if !force {
        let cached = state.jwks.read().await;
        if let Some(ref jwks) = *cached {
            return Ok(jwks.clone());
        }
    }

+    // Hold the write lock across the fetch so concurrent refreshers
+    // don't all hammer Keycloak when keys rotate. If another writer
+    // already populated a fresh JWKS while we were waiting (and we
+    // weren't asked to force), use theirs.
+    let mut cached = state.jwks.write().await;
+    if !force {
+        if let Some(ref jwks) = *cached {
+            return Ok(jwks.clone());
+        }
+    }
+
    let resp = reqwest::get(&state.jwks_url)
        .await
        .map_err(|e| format!("failed to fetch JWKS: {e}"))?;
@@ -215,7 +282,6 @@ async fn fetch_or_get_jwks(state: &JwksState) -> Result<JwkSet, String> {
        .await
        .map_err(|e| format!("failed to parse JWKS: {e}"))?;

-    let mut cached = state.jwks.write().await;
    *cached = Some(jwks.clone());

    Ok(jwks)
@@ -293,6 +359,24 @@ mod tests {
        );
    }

+    #[test]
+    fn try_validate_returns_stale_when_kid_missing_from_jwks() {
+        // Empty JWKS — the kid we ask for can't possibly match. The error
+        // must classify as Stale so the caller refreshes JWKS and retries.
+        let jwks = JwkSet { keys: vec![] };
+        let header = jsonwebtoken::Header {
+            alg: jsonwebtoken::Algorithm::RS256,
+            kid: Some("kid-rotated-out".to_string()),
+            ..Default::default()
+        };
+        let err = try_validate("ignored.token.value", &header, "kid-rotated-out", &jwks)
+            .expect_err("should fail");
+        match err {
+            ValidationError::Stale(s) => assert!(s.contains("no matching key")),
+            ValidationError::Permanent(s) => panic!("must be Stale, got Permanent: {s}"),
+        }
+    }
+
    #[test]
    fn is_write_detects_methods() {
        assert!(!is_write(&Method::GET));
Author	SHA1	Message	Date
Sharang Parnerkar	003835764e	fixup(m7.2-A): validate db_prefix at connect, bump hash to 16 bytes CI / Check (pull_request) Successful in 8m29s Details CI / Detect Changes (pull_request) Has been skipped Details CI / Deploy Agent (pull_request) Has been skipped Details CI / Deploy Dashboard (pull_request) Has been skipped Details CI / Deploy Docs (pull_request) Has been skipped Details CI / Deploy MCP (pull_request) Has been skipped Details Addresses review feedback on the hash-fallback path. The original `debug_assert!(hashed.len() <= MAX_DB_NAME_LEN)` was a runtime hack that vanished in release builds. With an 8-byte hash truncation (~2^32 birthday-collision resistance), two tenant_ids hashing to the same suffix would silently share a database — no panic, no rollback, just cross-tenant data leak. Not acceptable for a regulated-industry product. Changes: - Bump hash truncation 8 → 16 bytes (32 hex chars). 2^64 birthday resistance — collision-impossible at our scale. - Add MAX_PREFIX_LEN (= 30) and validate db_prefix.len() at `DatabasePool::connect`. The runtime hash-fallback arithmetic is now provably within Mongo's 63-byte cap; drop the debug_assert!. - New test `connect_rejects_overlong_db_prefix` exercises the inclusive bound (30 passes, 31 fails). - Existing hash-fallback test now asserts a 32-char hex suffix + basic distinctness for two different inputs. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>	2026-06-17 13:16:46 +02:00
Sharang Parnerkar	e3aabe7d18	feat(m7.2-A): introduce per-tenant DatabasePool CI / Check (pull_request) Successful in 8m40s Details CI / Detect Changes (pull_request) Has been skipped Details CI / Deploy Agent (pull_request) Has been skipped Details CI / Deploy Dashboard (pull_request) Has been skipped Details CI / Deploy Docs (pull_request) Has been skipped Details CI / Deploy MCP (pull_request) Has been skipped Details First slice of the M7.2 tenant-isolation work. Adds a `DatabasePool` that hands out per-tenant `Database` handles physically scoped to `<prefix>_<tenant_id>` Mongo databases. Isolation is at the driver, not at "we hope we filter" — a handle for tenant A literally cannot see tenant B's documents because it's connected to a different db. What's in this PR - DatabasePool::connect — pings the cluster, prepares per-tenant lazy handles. - DatabasePool::for_tenant(&TenantContext) — returns a Database scoped to that tenant. ensure_indexes runs once per tenant per process via a DashMap-backed marker; failure rolls the marker back so the next request retries. - tenant_db_name — `<prefix>_<sanitized_tenant_id>` if it fits in Mongo's 63-byte db-name cap, else `<prefix>_<sha256-16hex>` fallback. - Sanitizer rewrites the Mongo-disallowed chars (`/ \ . " $ <space> NUL`) so any future tenant_id shape works. - ComplianceAgent gains a `db_pool: DatabasePool` field next to the existing `db: Database`. Handlers / pipelines / webhooks still use `db` — they migrate to `db_pool.for_tenant(&ctx)` in M7.2-B/C and `db` goes away in M7.2-D. Test plan - cargo fmt --all clean - cargo clippy --workspace --exclude compliance-dashboard -- -D warnings clean - cargo test -p compliance-core --lib — 7 pass - cargo test -p compliance-agent --lib — 228 pass - cargo test -p compliance-agent --test tenant_isolation — 4 pass against live mongo on 27017: * pool_isolates_tenants_at_driver_level — writes for acme + globex, reads through each tenant's handle; each sees exactly its own data with no filter doc anywhere. * for_tenant_is_idempotent_index_creation — second + third call for the same tenant do not error. * tenant_db_name_sanitizes_unsafe_characters * tenant_db_name_falls_back_to_hash_when_too_long — 100-byte tenant_id collapses to a stable 8-byte hex suffix. Why per-tenant DB vs `tenant_id` field + filter - Driver-level isolation; impossible to forget the filter on one of the 184 query call-sites in compliance-agent. - Handlers don't change shape at migration — `agent.db.findings()` becomes `db.findings()` after pulling `db` from `agent.db_pool.for_tenant(&ctx)`. - GDPR delete = `db.dropDatabase()`. - On-prem deploy = the same code path, with one tenant. - Trade-off accepted: index storage duplicated per tenant; Mongo's ~thousand-db ceiling is way above the 10s-100s tenants we're targeting. Caveats - Existing `agent.db` continues to point at the single legacy db. Handlers / pipelines that use it are unscoped until M7.2-B/C migrate them. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>	2026-06-17 11:58:24 +02:00
sharang	183234f9af	feat(m7.1): wire compliance-agent to compliance-core auth + status gate (#85 ) CI / Check (push) Has been skipped Details CI / Detect Changes (push) Successful in 5s Details CI / Deploy Agent (push) Successful in 8m38s Details CI / Deploy Dashboard (push) Successful in 7m30s Details CI / Deploy Docs (push) Has been skipped Details CI / Deploy MCP (push) Successful in 1m55s Details	2026-06-17 09:36:52 +00:00
sharang	dbadff0aac	fix(m7.1): JWKS refresh-on-failure in auth middleware (#84 ) CI / Check (push) Has been skipped Details CI / Detect Changes (push) Successful in 3s Details CI / Deploy Agent (push) Successful in 11m44s Details CI / Deploy Dashboard (push) Successful in 13m1s Details CI / Deploy Docs (push) Has been skipped Details CI / Deploy MCP (push) Successful in 1m53s Details	2026-06-04 14:46:14 +00:00
sharang	116293519d	M7.1 smoke harness: lift auth to compliance-core + compliance-smoke service (#83 ) CI / Check (push) Has been cancelled Details CI / Detect Changes (push) Has been cancelled Details CI / Deploy Agent (push) Has been cancelled Details CI / Deploy Dashboard (push) Has been cancelled Details CI / Deploy Docs (push) Has been cancelled Details CI / Deploy MCP (push) Has been cancelled Details	2026-06-04 14:38:35 +00:00