feat(dashboard): proactively refresh expired Keycloak tokens

The dashboard stored a refresh_token in the session at login (auth.rs) but never used it. Once the access_token's 5-minute lifespan ran out, every subsequent agent call failed with 401 ExpiredSignature. The UI showed "unable to load X" until the user logged out and back in. Fix: before attaching the bearer, decode the JWT's `exp` claim and proactively refresh via the stored refresh_token if the token is expired or within REFRESH_SKEW_SECS (30s) of expiry. Updates the session with the new access_token (and rotated refresh_token if KC sends one). Refresh failures fall through with the stale token so the agent's 401 surfaces to the UI rather than failing the request at the dashboard layer. Why "proactive" instead of "retry on 401" - Saves a wasted round-trip on every agent call once the token has aged past 5 min. - Doesn't require cloning RequestBuilder bodies for retry. - Same end state — fresh token reaches the agent. Test plan - cargo test -p compliance-dashboard --features server --no-default-features infrastructure::agent_client::tests — 5 pass: * expired JWT → refresh * near-expiry within skew window → refresh * fresh JWT → no refresh * malformed/empty JWT → refresh (defensive) * JWT without exp claim → refresh (defensive) - Manual after deploy: dashboard works past the 5-min token lifespan without manual re-login. Note - The refresh code addresses the ExpiredSignature failure mode. The separate "JWT is missing tenant_id claim" 401 is a Keycloak realm config issue (the user logging in lacks the M7.1 attributes that the protocol mappers consume) and is fixed by realm/attribute config, not by this PR. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-06-17 21:38:06 +02:00
9 changed files with 16 additions and 358 deletions
@@ -1,115 +0,0 @@
 //! Cross-tenant admin endpoints (`/api/v1/admin/*`).
 //!
 //! Operator-only. Auth is a **static bearer token** (`ADMIN_API_TOKEN`
 //! env on the agent) — explicitly NOT a Keycloak JWT, because the
 //! whole point of these endpoints is to operate ACROSS tenants. A
 //! customer JWT (which always carries a single tenant_id) has no
 //! business mounting them.
 //!
 //! Routes are only registered when `ADMIN_API_TOKEN` is set. With no
 //! token, the endpoints don't exist at all (404), which is a stronger
 //! guarantee than "401 if you guess the path".
 //!
 //! Operations:
 //! - `GET    /api/v1/admin/tenants`              — list tenant DBs
 //! - `DELETE /api/v1/admin/tenants/{tenant_id}`  — GDPR delete
 //!
 //! Tenant ids in URLs are passed as-is to `DatabasePool::drop_tenant`,
 //! which sanitises them the same way it does for creation. Listing
 //! returns the raw DB names from `list_tenant_db_names` — operators
 //! can reverse-derive the tenant_id from the prefix.
 use axum::extract::{Extension, Path, Request};
 use axum::http::{header, StatusCode};
 use axum::middleware::Next;
 use axum::response::{IntoResponse, Response};
 use axum::Json;
 use secrecy::ExposeSecret;
 use serde::Serialize;
 use super::dto::AgentExt;
 #[derive(Serialize)]
 pub struct ListTenantDbsResponse {
    pub tenant_db_names: Vec<String>,
 }
 #[tracing::instrument(skip_all)]
 pub async fn list_tenant_dbs(
    Extension(agent): AgentExt,
 ) -> Result<Json<ListTenantDbsResponse>, StatusCode> {
    let names = agent.db_pool.list_tenant_db_names().await.map_err(|e| {
        tracing::error!("admin: list_tenant_db_names failed: {e}");
        StatusCode::INTERNAL_SERVER_ERROR
    })?;
    Ok(Json(ListTenantDbsResponse {
        tenant_db_names: names,
    }))
 }
 #[tracing::instrument(skip_all, fields(tenant_id = %tenant_id))]
 pub async fn drop_tenant_db(
    Extension(agent): AgentExt,
    Path(tenant_id): Path<String>,
 ) -> Result<Json<serde_json::Value>, StatusCode> {
    agent.db_pool.drop_tenant(&tenant_id).await.map_err(|e| {
        tracing::error!("admin: drop_tenant failed: {e}");
        StatusCode::INTERNAL_SERVER_ERROR
    })?;
    Ok(Json(serde_json::json!({ "status": "dropped" })))
 }
 /// Constant-time-ish comparison of the configured admin token against
 /// the incoming bearer. Uses `subtle`-style byte equality so timing
 /// attacks can't probe the token character by character.
 fn tokens_eq(a: &str, b: &str) -> bool {
    if a.len() != b.len() {
        return false;
    }
    let mut diff = 0u8;
    for (x, y) in a.bytes().zip(b.bytes()) {
        diff |= x ^ y;
    }
    diff == 0
 }
 /// Middleware enforcing the static `ADMIN_API_TOKEN`. Mounted only on
 /// the admin sub-router, so this never runs on customer routes.
 pub async fn require_admin_token(
    Extension(agent): AgentExt,
    request: Request,
    next: Next,
 ) -> Response {
    let Some(expected) = agent.config.admin_api_token.as_ref() else {
        // Belt-and-braces — if the routes were somehow mounted without
        // a token configured, refuse rather than no-op-pass.
        return (StatusCode::NOT_FOUND, "admin disabled").into_response();
    };
    let presented = request
        .headers()
        .get(header::AUTHORIZATION)
        .and_then(|v| v.to_str().ok())
        .and_then(|s| s.strip_prefix("Bearer "))
        .map(|s| s.trim());
    let Some(presented) = presented.filter(|s| !s.is_empty()) else {
        return (StatusCode::UNAUTHORIZED, "Missing bearer token").into_response();
    };
    if !tokens_eq(presented, expected.expose_secret()) {
        return (StatusCode::UNAUTHORIZED, "Invalid admin token").into_response();
    }
    next.run(request).await
 }
 #[cfg(test)]
 mod tests {
    use super::*;
    #[test]
    fn tokens_eq_basic() {
        assert!(tokens_eq("abc", "abc"));
        assert!(!tokens_eq("abc", "abd"));
        assert!(!tokens_eq("abc", "abcd"));
        assert!(!tokens_eq("", "x"));
        assert!(tokens_eq("", ""));
    }
 }
@@ -1,4 +1,3 @@
 pub mod admin;
 pub mod chat;
 pub mod dast;
 pub mod dto;
@@ -4,8 +4,7 @@ use axum::extract::Request;
 use axum::http::HeaderValue;
 use axum::middleware::Next;
 use axum::response::Response;
-use axum::routing::{delete, get};
+use axum::{middleware, Extension};
 use axum::{middleware, Extension, Router};
 use tokio::sync::RwLock;
 use tower_http::cors::CorsLayer;
 use tower_http::set_header::SetResponseHeaderLayer;
@@ -15,7 +14,6 @@ use compliance_core::auth::{require_jwt_auth, require_tenant_status, JwksState};
 use compliance_core::{TenantContext, TenantStatus};
 use crate::agent::ComplianceAgent;
 use crate::api::handlers;
 use crate::api::routes;
 use crate::error::AgentError;
@@ -52,28 +50,7 @@ pub async fn inject_dev_tenant(mut request: Request, next: Next) -> Response {
 }
 pub async fn start_api_server(agent: ComplianceAgent, port: u16) -> Result<(), AgentError> {
    // Admin sub-router. Routes are only mounted when ADMIN_API_TOKEN is
    // configured — without it, the paths don't exist at all (404 rather
    // than 401), so an operator who hasn't opted in can't fingerprint
    // the surface area.
    let admin_router: Router = if agent.config.admin_api_token.is_some() {
        tracing::info!("Admin API enabled — /api/v1/admin/* mounted behind ADMIN_API_TOKEN bearer");
        Router::new()
            .route(
                "/api/v1/admin/tenants",
                get(handlers::admin::list_tenant_dbs),
            )
            .route(
                "/api/v1/admin/tenants/{tenant_id}",
                delete(handlers::admin::drop_tenant_db),
            )
            .layer(middleware::from_fn(handlers::admin::require_admin_token))
    } else {
        Router::new()
    };
    let mut app = routes::build_router()
        .merge(admin_router)
        .layer(Extension(Arc::new(agent.clone())))
        .layer(CorsLayer::permissive())
        .layer(TraceLayer::new_for_http())
@@ -59,7 +59,5 @@ pub fn load_config() -> Result<AgentConfig, AgentError> {
            .unwrap_or(true),
        pentest_imap_username: env_var_opt("PENTEST_IMAP_USERNAME"),
        pentest_imap_password: env_secret_opt("PENTEST_IMAP_PASSWORD"),
        admin_api_token: env_secret_opt("ADMIN_API_TOKEN"),
        tenant_registry_url: env_var_opt("TENANT_REGISTRY_URL"),
    })
 }
@@ -339,8 +339,6 @@ mod tests {
            pentest_imap_tls: true,
            pentest_imap_username: None,
            pentest_imap_password: None,
            admin_api_token: None,
            tenant_registry_url: None,
        }
    }
@@ -7,18 +7,11 @@ use crate::agent::ComplianceAgent;
 use crate::database::Database;
 use crate::error::AgentError;
-/// Default tenant the scheduler runs against when neither the tenant
+/// Default tenant the scheduler runs against when `SCHEDULER_TENANT_IDS`
-/// registry nor `SCHEDULER_TENANT_IDS` are configured. Matches the
+/// isn't set. Matches the dev-injector default so a bare `cargo run` has
-/// dev-injector default so a bare `cargo run` has the scheduler
+/// the scheduler scanning whatever lives in `<prefix>_dev`.
 /// scanning whatever lives in `<prefix>_dev`.
 const DEFAULT_SCHEDULER_TENANT_ID: &str = "dev";
 /// Request timeout when fetching the live tenant list from the
 /// registry. Kept short — if the registry is slow we'd rather fall
 /// back to env-configured ids and finish the tick than block the
 /// scheduler loop.
 const REGISTRY_FETCH_TIMEOUT_SECS: u64 = 5;
 pub async fn start_scheduler(agent: &ComplianceAgent) -> Result<(), AgentError> {
    let sched = JobScheduler::new()
        .await
@@ -31,12 +24,7 @@ pub async fn start_scheduler(agent: &ComplianceAgent) -> Result<(), AgentError>
        let agent = scan_agent.clone();
        Box::pin(async move {
            tracing::info!("Scheduled scan triggered");
-            let tenants = scheduler_tenants(&agent).await;
+            for tenant_id in scheduler_tenants() {
            tracing::debug!(
                tenant_count = tenants.len(),
                "Scheduled scan: tenants resolved"
            );
            for tenant_id in tenants {
                scan_all_repos(&agent, &tenant_id).await;
            }
        })
@@ -54,12 +42,7 @@ pub async fn start_scheduler(agent: &ComplianceAgent) -> Result<(), AgentError>
        let agent = cve_agent.clone();
        Box::pin(async move {
            tracing::info!("CVE monitor triggered");
-            let tenants = scheduler_tenants(&agent).await;
+            for tenant_id in scheduler_tenants() {
            tracing::debug!(
                tenant_count = tenants.len(),
                "CVE monitor: tenants resolved"
            );
            for tenant_id in tenants {
                monitor_cves(&agent, &tenant_id).await;
            }
        })
@@ -75,14 +58,9 @@ pub async fn start_scheduler(agent: &ComplianceAgent) -> Result<(), AgentError>
        .await
        .map_err(|e| AgentError::Scheduler(format!("Failed to start scheduler: {e}")))?;
-    let tenants = scheduler_tenants(agent).await;
+    let tenants = scheduler_tenants();
    let source = if agent.config.tenant_registry_url.is_some() {
        "tenant-registry (env fallback)"
    } else {
        "env (SCHEDULER_TENANT_IDS)"
    };
    tracing::info!(
-        "Scheduler started: scans='{}', CVE monitor='{}', tenant source={source}, tenants={tenants:?}",
+        "Scheduler started: scans='{}', CVE monitor='{}', tenants={tenants:?}",
        agent.config.scan_schedule,
        agent.config.cve_monitor_schedule,
    );
@@ -93,40 +71,10 @@ pub async fn start_scheduler(agent: &ComplianceAgent) -> Result<(), AgentError>
    }
 }
-/// Tenants the scheduler iterates each tick.
+/// Tenants the scheduler iterates each tick. From `SCHEDULER_TENANT_IDS`
-///
+/// (comma-separated), or `DEFAULT_SCHEDULER_TENANT_ID` if unset. M7.2-D
-/// Resolution order:
+/// will replace this with a pull from the tenant-registry.
-/// 1. **Tenant registry** at `agent.config.tenant_registry_url`
+fn scheduler_tenants() -> Vec<String> {
 ///    (`GET /v1/tenants`). Fresh on every tick — picks up newly
 ///    provisioned tenants without an agent restart.
 /// 2. **`SCHEDULER_TENANT_IDS`** env (comma-separated) — fallback when
 ///    the registry is unreachable, the response is malformed, or no
 ///    registry URL is configured.
 /// 3. **`DEFAULT_SCHEDULER_TENANT_ID`** (`"dev"`) — last-ditch fallback
 ///    so the scheduler keeps doing something useful in dev.
 ///
 /// We never panic out of this function — the scheduler must keep
 /// firing even if the registry is offline.
 async fn scheduler_tenants(agent: &ComplianceAgent) -> Vec<String> {
    if let Some(url) = agent.config.tenant_registry_url.as_deref() {
        match fetch_tenants_from_registry(&agent.http, url).await {
            Ok(v) if !v.is_empty() => return v,
            Ok(_) => {
                tracing::warn!("tenant-registry returned empty list; falling back to env");
            }
            Err(e) => {
                tracing::warn!(
                    url = %url,
                    error = %e,
                    "tenant-registry fetch failed; falling back to env"
                );
            }
        }
    }
    tenants_from_env()
 }
 fn tenants_from_env() -> Vec<String> {
    std::env::var("SCHEDULER_TENANT_IDS")
        .ok()
        .map(|s| {
@@ -140,134 +88,6 @@ fn tenants_from_env() -> Vec<String> {
        .unwrap_or_else(|| vec![DEFAULT_SCHEDULER_TENANT_ID.to_string()])
 }
 /// Shape we accept from the registry. Liberal in what we accept:
 /// the registry can return any field shape as long as either `id` or
 /// `tenant_id` is present. Other fields are ignored.
 #[derive(serde::Deserialize)]
 struct RegistryTenant {
    #[serde(alias = "tenant_id")]
    id: String,
    /// Filter out non-running tenants if status is present. Missing
    /// status defaults to "active" so older registry deployments keep
    /// working.
    #[serde(default = "default_status")]
    status: String,
 }
 fn default_status() -> String {
    "active".to_string()
 }
 #[derive(serde::Deserialize)]
 struct RegistryListResponse {
    data: Vec<RegistryTenant>,
 }
 async fn fetch_tenants_from_registry(
    http: &reqwest::Client,
    base_url: &str,
 ) -> Result<Vec<String>, String> {
    let url = format!("{}/v1/tenants", base_url.trim_end_matches('/'));
    let resp = http
        .get(&url)
        .timeout(std::time::Duration::from_secs(REGISTRY_FETCH_TIMEOUT_SECS))
        .send()
        .await
        .map_err(|e| format!("request failed: {e}"))?;
    if !resp.status().is_success() {
        return Err(format!("registry returned {}", resp.status()));
    }
    let body: RegistryListResponse = resp
        .json()
        .await
        .map_err(|e| format!("invalid JSON: {e}"))?;
    Ok(filter_active(body.data))
 }
 /// Frozen/Archived tenants don't need scheduled scans; the M7.1
 /// status gate would 402/410 anyway. Skip them so we don't waste
 /// cycles. Active / trial / demo / anything-else-unknown all run.
 fn filter_active(rows: Vec<RegistryTenant>) -> Vec<String> {
    rows.into_iter()
        .filter(|t| !matches!(t.status.as_str(), "frozen" | "archived"))
        .map(|t| t.id)
        .collect()
 }
 #[cfg(test)]
 mod tests {
    use super::*;
    fn tenant(id: &str, status: &str) -> RegistryTenant {
        RegistryTenant {
            id: id.to_string(),
            status: status.to_string(),
        }
    }
    #[test]
    fn filter_active_keeps_running_skips_frozen_archived() {
        let rows = vec![
            tenant("a", "active"),
            tenant("b", "trial"),
            tenant("c", "demo"),
            tenant("d", "frozen"),
            tenant("e", "archived"),
            tenant("f", "weird-but-not-known-dead"),
        ];
        let out = filter_active(rows);
        assert_eq!(out, vec!["a", "b", "c", "f"]);
    }
    #[test]
    fn deserialize_registry_response_accepts_id_or_tenant_id() {
        let body = r#"{"data":[
            {"id":"a","status":"active"},
            {"tenant_id":"b","status":"trial"},
            {"id":"c"}
        ]}"#;
        let parsed: RegistryListResponse = serde_json::from_str(body).unwrap();
        assert_eq!(parsed.data.len(), 3);
        assert_eq!(parsed.data[0].id, "a");
        assert_eq!(parsed.data[1].id, "b");
        assert_eq!(parsed.data[2].id, "c");
        // Default status for the third entry should be "active"
        assert_eq!(parsed.data[2].status, "active");
    }
    /// Combined into a single test: cargo runs tests in parallel and
    /// env vars are process-global, so two separate tests touching
    /// `SCHEDULER_TENANT_IDS` race each other. Doing both checks in
    /// one test keeps them in a deterministic order.
    #[test]
    fn tenants_from_env_resolution() {
        std::env::remove_var("SCHEDULER_TENANT_IDS");
        assert_eq!(
            tenants_from_env(),
            vec![DEFAULT_SCHEDULER_TENANT_ID.to_string()],
            "unset → default"
        );
        std::env::set_var("SCHEDULER_TENANT_IDS", "acme, globex ,,hello");
        let out = tenants_from_env();
        std::env::remove_var("SCHEDULER_TENANT_IDS");
        assert_eq!(
            out,
            vec!["acme", "globex", "hello"],
            "splits + trims + drops empty"
        );
        std::env::set_var("SCHEDULER_TENANT_IDS", "");
        let out = tenants_from_env();
        std::env::remove_var("SCHEDULER_TENANT_IDS");
        assert_eq!(
            out,
            vec![DEFAULT_SCHEDULER_TENANT_ID.to_string()],
            "empty → default"
        );
    }
 }
 /// Resolve the per-tenant database. Logs and returns `None` on failure
 /// so the loop in the caller can continue with other tenants.
 async fn tenant_db(agent: &ComplianceAgent, tenant_id: &str) -> Option<Database> {
@@ -66,8 +66,6 @@ impl TestServer {
            pentest_imap_tls: false,
            pentest_imap_username: None,
            pentest_imap_password: None,
            admin_api_token: None,
            tenant_registry_url: None,
        };
        let agent = ComplianceAgent::new(config, db_pool);
@@ -63,24 +63,16 @@ struct Claims {
 const PUBLIC_ENDPOINTS: &[&str] = &["/api/v1/health"];
 /// Path prefixes that bypass JWT validation. The admin sub-router
 /// (`/api/v1/admin/*`) has its own static-bearer middleware and must
 /// not be routed through the customer-JWT path — a Keycloak token
 /// always carries a single tenant_id and would semantically conflict
 /// with cross-tenant admin operations.
 const PUBLIC_PREFIXES: &[&str] = &["/api/v1/admin/"];
 /// Middleware that validates Bearer JWT tokens against Keycloak's JWKS
 /// and attaches a `TenantContext` extension on success.
 ///
-/// Skips validation for the health endpoint and any path under one of
+/// Skips validation for the health endpoint.
-/// the [`PUBLIC_PREFIXES`]. If `JwksState` is not present (Keycloak
+/// If `JwksState` is not present (Keycloak not configured), requests
-/// not configured), requests pass through and downstream code must
+/// pass through and downstream code must handle the missing context.
 /// handle the missing context.
 pub async fn require_jwt_auth(mut request: Request, next: Next) -> Response {
    let path = request.uri().path();
-    if PUBLIC_ENDPOINTS.contains(&path) || PUBLIC_PREFIXES.iter().any(|p| path.starts_with(p)) {
+    if PUBLIC_ENDPOINTS.contains(&path) {
        return next.run(request).await;
    }
@@ -37,15 +37,6 @@ pub struct AgentConfig {
    pub pentest_imap_tls: bool,
    pub pentest_imap_username: Option<String>,
    pub pentest_imap_password: Option<SecretString>,
    /// Static bearer for the cross-tenant admin endpoints under
    /// `/api/v1/admin/*`. When `None`, those endpoints are not
    /// mounted at all (defense-in-depth: ops endpoints never reach
    /// any auth path if no operator has explicitly opted in).
    pub admin_api_token: Option<SecretString>,
    /// Live tenant-registry URL the scheduler consults for the list
    /// of tenants to iterate. When `None` or unreachable, scheduler
    /// falls back to `SCHEDULER_TENANT_IDS` env (M7.2-C).
    pub tenant_registry_url: Option<String>,
 }
 #[derive(Clone, Debug, Serialize, Deserialize)]