Files
compliance-scanner-agent/compliance-core/src/auth.rs
T
Sharang Parnerkar f474699279
CI / Check (pull_request) Successful in 8m17s
CI / Detect Changes (pull_request) Has been skipped
CI / Deploy Agent (pull_request) Has been skipped
CI / Deploy Dashboard (pull_request) Has been skipped
CI / Deploy Docs (pull_request) Has been skipped
CI / Deploy MCP (pull_request) Has been skipped
fix(core): JWKS refresh-on-failure in M7.1 auth middleware
Without this, every Keycloak signing-key rotation produces a silent
401 storm against every request until the agent restarts — the cached
JWKS is held forever and never reconciled against KC.

Now: when `kid` isn't in the cached JWKS or the matching key fails
signature verification, we classify the failure as Stale, force a JWKS
refresh, and retry once. Anything else (expired, malformed, missing
tenant_id) is Permanent and short-circuits straight to 401.

* Splits the path into a pure `try_validate(token, header, kid, jwks)`
  helper returning a `ValidationError { Stale | Permanent }` enum.
* `fetch_or_get_jwks(state, force)` takes a force flag and holds the
  write lock across the network fetch so concurrent refreshers don't
  all hammer Keycloak when keys rotate (the second writer reuses what
  the first put in cache).
* Adds a unit test for the kid-not-found Stale classification.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-06-04 16:40:55 +02:00

391 lines
13 KiB
Rust

//! M7.1 — JWT validation + tenant context propagation.
//!
//! `require_jwt_auth` validates a Bearer JWT against Keycloak's JWKS and
//! attaches a [`TenantContext`] to the request extensions. Downstream
//! middleware ([`require_tenant_status`]) and Axum extractors
//! ([`crate::tenant_ctx::TenantCtx`]) read it from there.
//!
//! Skipped paths:
//! * `/api/v1/health` — Kubernetes liveness; never authenticated.
//!
//! Failure modes:
//! * No `JwksState` extension → pass-through (single-tenant dev mode).
//! * Missing / malformed Bearer header → 401.
//! * Signature / expiry invalid → 401.
//! * Claims present but tenant_id missing → 401 (treated as a malformed
//! token; the realm must always issue tenant_id).
use std::sync::Arc;
use axum::{
extract::Request,
http::Method,
middleware::Next,
response::{IntoResponse, Response},
};
use jsonwebtoken::{decode, decode_header, jwk::JwkSet, DecodingKey, Validation};
use reqwest::StatusCode;
use serde::Deserialize;
use tokio::sync::RwLock;
use crate::{OrgRole, TenantContext, TenantStatus};
/// Cached JWKS from Keycloak for token validation.
#[derive(Clone)]
pub struct JwksState {
pub jwks: Arc<RwLock<Option<JwkSet>>>,
pub jwks_url: String,
}
/// Raw shape of the JWT payload — matches the breakpilot-dev realm's
/// protocol-mapper output. Missing fields default to "" / empty so a
/// realm that hasn't been fully wired yet still validates.
#[derive(Debug, Deserialize)]
struct Claims {
sub: String,
#[serde(default)]
name: Option<String>,
#[serde(default)]
preferred_username: Option<String>,
#[serde(default)]
tenant_id: String,
#[serde(default)]
tenant_slug: String,
#[serde(default)]
org_roles: Vec<String>,
#[serde(default)]
products: Vec<String>,
#[serde(default)]
plan: String,
#[serde(default)]
tenant_status: Option<TenantStatus>,
}
const PUBLIC_ENDPOINTS: &[&str] = &["/api/v1/health"];
/// Middleware that validates Bearer JWT tokens against Keycloak's JWKS
/// and attaches a `TenantContext` extension on success.
///
/// Skips validation for the health endpoint.
/// If `JwksState` is not present (Keycloak not configured), requests
/// pass through and downstream code must handle the missing context.
pub async fn require_jwt_auth(mut request: Request, next: Next) -> Response {
let path = request.uri().path();
if PUBLIC_ENDPOINTS.contains(&path) {
return next.run(request).await;
}
let jwks_state = match request.extensions().get::<JwksState>() {
Some(s) => s.clone(),
None => return next.run(request).await,
};
let auth_header = match request.headers().get("authorization") {
Some(h) => h,
None => return (StatusCode::UNAUTHORIZED, "Missing authorization header").into_response(),
};
let token = match auth_header.to_str() {
Ok(s) if s.starts_with("Bearer ") => &s[7..],
_ => return (StatusCode::UNAUTHORIZED, "Invalid authorization header").into_response(),
};
match validate_token(token, &jwks_state).await {
Ok(ctx) => {
request.extensions_mut().insert(ctx);
next.run(request).await
}
Err(e) => {
tracing::warn!("JWT validation failed: {e}");
(StatusCode::UNAUTHORIZED, "Invalid token").into_response()
}
}
}
/// Middleware that enforces the M7.1 `tenant_status` contract.
///
/// * `Active` / `Trial` / `Demo` — pass through.
/// * `Frozen` — read-only after cancel / non-payment. Writes return 402.
/// * `Archived` — data-retention window closed. Every request returns 410.
///
/// Pass-through when no `TenantContext` is present (single-tenant dev or
/// the upstream JWT middleware ran without `JwksState`).
pub async fn require_tenant_status(request: Request, next: Next) -> Response {
let ctx = match request.extensions().get::<TenantContext>() {
Some(c) => c.clone(),
None => return next.run(request).await,
};
if ctx.status.is_archived() {
return (
StatusCode::GONE,
"Tenant archived — data retention window closed",
)
.into_response();
}
if ctx.status.is_frozen() && is_write(request.method()) {
return (
StatusCode::PAYMENT_REQUIRED,
"Tenant frozen — read-only. Re-activate to resume writes.",
)
.into_response();
}
next.run(request).await
}
/// Treat anything other than GET/HEAD/OPTIONS as a write. Good enough for
/// REST. The few exceptions (e.g. read-side POSTs) can opt out at the
/// handler level once we have them.
fn is_write(m: &Method) -> bool {
!matches!(m, &Method::GET | &Method::HEAD | &Method::OPTIONS)
}
async fn validate_token(token: &str, state: &JwksState) -> Result<TenantContext, String> {
let header = decode_header(token).map_err(|e| format!("failed to decode JWT header: {e}"))?;
let kid = header
.kid
.clone()
.ok_or_else(|| "JWT missing kid header".to_string())?;
// First try against whatever's currently cached. If the kid isn't
// there or the signature doesn't verify, the cached JWKS is most
// likely stale (KC rotated keys) — refresh once and retry before
// giving up. Without this every key rotation produces a silent 401
// storm that only goes away when the agent restarts.
let jwks = fetch_or_get_jwks(state, false).await?;
match try_validate(token, &header, &kid, &jwks) {
Ok(ctx) => Ok(ctx),
Err(ValidationError::Permanent(e)) => Err(e),
Err(ValidationError::Stale(reason)) => {
tracing::info!(
kid = %kid,
reason = %reason,
"JWKS appears stale — forcing refresh and retrying"
);
let jwks = fetch_or_get_jwks(state, true).await?;
try_validate(token, &header, &kid, &jwks).map_err(|e| match e {
ValidationError::Stale(s) | ValidationError::Permanent(s) => s,
})
}
}
}
#[derive(Debug)]
enum ValidationError {
/// Refresh-eligible: cached JWKS may be stale.
Stale(String),
/// Refusing the token regardless of JWKS freshness.
Permanent(String),
}
fn try_validate(
token: &str,
header: &jsonwebtoken::Header,
kid: &str,
jwks: &JwkSet,
) -> Result<TenantContext, ValidationError> {
let jwk = match jwks
.keys
.iter()
.find(|k| k.common.key_id.as_deref() == Some(kid))
{
Some(j) => j,
None => {
return Err(ValidationError::Stale(
"no matching key found in JWKS".to_string(),
))
}
};
let decoding_key = DecodingKey::from_jwk(jwk)
.map_err(|e| ValidationError::Permanent(format!("failed to create decoding key: {e}")))?;
let mut validation = Validation::new(header.alg);
validation.validate_exp = true;
validation.validate_aud = false;
let data = match decode::<Claims>(token, &decoding_key, &validation) {
Ok(d) => d,
Err(e) => {
// Signature mismatch is the other refresh-eligible failure:
// the matching kid is present but the key bytes don't match.
// Everything else (expired, malformed, etc.) is permanent.
return Err(
if matches!(e.kind(), jsonwebtoken::errors::ErrorKind::InvalidSignature) {
ValidationError::Stale(format!("token validation failed: {e}"))
} else {
ValidationError::Permanent(format!("token validation failed: {e}"))
},
);
}
};
claims_to_context(data.claims).map_err(ValidationError::Permanent)
}
/// Map the decoded JWT payload into the platform-wide `TenantContext`.
/// Pulled out for unit testing — no I/O.
fn claims_to_context(c: Claims) -> Result<TenantContext, String> {
if c.tenant_id.is_empty() {
return Err("JWT is missing tenant_id claim".to_string());
}
let status = c.tenant_status.unwrap_or_else(|| {
tracing::warn!(
"JWT missing tenant_status claim for tenant {} — defaulting to Trial",
c.tenant_id
);
TenantStatus::Trial
});
Ok(TenantContext {
tenant_id: c.tenant_id,
tenant_slug: c.tenant_slug,
org_roles: c.org_roles.iter().map(|r| OrgRole::parse(r)).collect(),
products: c.products,
plan: c.plan,
status,
user_id: c.sub,
user_name: c.name.or(c.preferred_username),
})
}
async fn fetch_or_get_jwks(state: &JwksState, force: bool) -> Result<JwkSet, String> {
if !force {
let cached = state.jwks.read().await;
if let Some(ref jwks) = *cached {
return Ok(jwks.clone());
}
}
// Hold the write lock across the fetch so concurrent refreshers
// don't all hammer Keycloak when keys rotate. If another writer
// already populated a fresh JWKS while we were waiting (and we
// weren't asked to force), use theirs.
let mut cached = state.jwks.write().await;
if !force {
if let Some(ref jwks) = *cached {
return Ok(jwks.clone());
}
}
let resp = reqwest::get(&state.jwks_url)
.await
.map_err(|e| format!("failed to fetch JWKS: {e}"))?;
let jwks: JwkSet = resp
.json()
.await
.map_err(|e| format!("failed to parse JWKS: {e}"))?;
*cached = Some(jwks.clone());
Ok(jwks)
}
#[cfg(test)]
#[allow(clippy::expect_used, clippy::unwrap_used)]
mod tests {
use super::*;
fn base_claims() -> Claims {
Claims {
sub: "user-123".to_string(),
name: Some("Alice Acme".to_string()),
preferred_username: None,
tenant_id: "00000000-0000-0000-0000-000000000001".to_string(),
tenant_slug: "acme".to_string(),
org_roles: vec!["IT_ADMIN".to_string()],
products: vec!["compliance".to_string()],
plan: "professional".to_string(),
tenant_status: Some(TenantStatus::Active),
}
}
#[test]
fn claims_to_context_happy_path() {
let ctx = claims_to_context(base_claims()).expect("should map");
assert_eq!(ctx.tenant_id, "00000000-0000-0000-0000-000000000001");
assert_eq!(ctx.tenant_slug, "acme");
assert_eq!(ctx.org_roles, vec![OrgRole::ItAdmin]);
assert_eq!(ctx.products, vec!["compliance"]);
assert_eq!(ctx.plan, "professional");
assert_eq!(ctx.status, TenantStatus::Active);
assert_eq!(ctx.user_id, "user-123");
assert_eq!(ctx.user_name.as_deref(), Some("Alice Acme"));
}
#[test]
fn claims_to_context_rejects_missing_tenant_id() {
let mut c = base_claims();
c.tenant_id = "".to_string();
let err = claims_to_context(c).expect_err("should reject");
assert!(err.contains("tenant_id"));
}
#[test]
fn claims_to_context_defaults_status_when_missing() {
let mut c = base_claims();
c.tenant_status = None;
let ctx = claims_to_context(c).expect("should map");
assert_eq!(ctx.status, TenantStatus::Trial);
}
#[test]
fn claims_to_context_falls_back_to_preferred_username() {
let mut c = base_claims();
c.name = None;
c.preferred_username = Some("alice@acme.dev".to_string());
let ctx = claims_to_context(c).expect("should map");
assert_eq!(ctx.user_name.as_deref(), Some("alice@acme.dev"));
}
#[test]
fn claims_to_context_parses_multiple_roles() {
let mut c = base_claims();
c.org_roles = vec![
"IT_ADMIN".to_string(),
"CXO".to_string(),
"GARBAGE".to_string(),
];
let ctx = claims_to_context(c).expect("should map");
assert_eq!(
ctx.org_roles,
vec![OrgRole::ItAdmin, OrgRole::Cxo, OrgRole::Unknown]
);
}
#[test]
fn try_validate_returns_stale_when_kid_missing_from_jwks() {
// Empty JWKS — the kid we ask for can't possibly match. The error
// must classify as Stale so the caller refreshes JWKS and retries.
let jwks = JwkSet { keys: vec![] };
let header = jsonwebtoken::Header {
alg: jsonwebtoken::Algorithm::RS256,
kid: Some("kid-rotated-out".to_string()),
..Default::default()
};
let err = try_validate("ignored.token.value", &header, "kid-rotated-out", &jwks)
.expect_err("should fail");
match err {
ValidationError::Stale(s) => assert!(s.contains("no matching key")),
ValidationError::Permanent(s) => panic!("must be Stale, got Permanent: {s}"),
}
}
#[test]
fn is_write_detects_methods() {
assert!(!is_write(&Method::GET));
assert!(!is_write(&Method::HEAD));
assert!(!is_write(&Method::OPTIONS));
assert!(is_write(&Method::POST));
assert!(is_write(&Method::PUT));
assert!(is_write(&Method::PATCH));
assert!(is_write(&Method::DELETE));
}
}