merge: sync with origin/main, take upstream on conflicts

# Conflicts:
#	admin-compliance/lib/sdk/types.ts
#	admin-compliance/lib/sdk/vendor-compliance/types.ts
This commit is contained in:
Sharang Parnerkar
2026-04-16 16:26:48 +02:00
352 changed files with 181673 additions and 2188 deletions

View File

@@ -6,6 +6,8 @@ from .routes import router
logger = logging.getLogger(__name__)
_failed_routers: dict[str, str] = {}
def _safe_import_router(module_name: str, attr: str = "router"):
"""Import a router module safely — log error but don't crash the whole app."""
@@ -14,6 +16,7 @@ def _safe_import_router(module_name: str, attr: str = "router"):
return getattr(mod, attr)
except Exception as e:
logger.error("Failed to import %s: %s", module_name, e)
_failed_routers[module_name] = str(e)
return None
@@ -53,6 +56,13 @@ _ROUTER_MODULES = [
"wiki_routes",
"canonical_control_routes",
"control_generator_routes",
"crosswalk_routes",
"process_task_routes",
"evidence_check_routes",
"vvt_library_routes",
"tom_mapping_routes",
"llm_audit_routes",
"assertion_routes",
]
_loaded_count = 0

View File

@@ -0,0 +1,227 @@
"""
API routes for Assertion Engine (Anti-Fake-Evidence Phase 2).
Endpoints:
- /assertions: CRUD for assertions
- /assertions/extract: Automatic extraction from entity text
- /assertions/summary: Stats (total assertions, facts, unverified)
"""
import logging
from datetime import datetime
from typing import Optional
from fastapi import APIRouter, Depends, HTTPException, Query
from sqlalchemy.orm import Session
from classroom_engine.database import get_db
from ..db.models import AssertionDB
from ..services.assertion_engine import extract_assertions
from .schemas import (
AssertionCreate,
AssertionUpdate,
AssertionResponse,
AssertionListResponse,
AssertionSummaryResponse,
AssertionExtractRequest,
)
from .audit_trail_utils import log_audit_trail, generate_id
logger = logging.getLogger(__name__)
router = APIRouter(tags=["compliance-assertions"])
def _build_assertion_response(a: AssertionDB) -> AssertionResponse:
return AssertionResponse(
id=a.id,
tenant_id=a.tenant_id,
entity_type=a.entity_type,
entity_id=a.entity_id,
sentence_text=a.sentence_text,
sentence_index=a.sentence_index,
assertion_type=a.assertion_type,
evidence_ids=a.evidence_ids or [],
confidence=a.confidence or 0.0,
normative_tier=a.normative_tier,
verified_by=a.verified_by,
verified_at=a.verified_at,
created_at=a.created_at,
updated_at=a.updated_at,
)
@router.post("/assertions", response_model=AssertionResponse)
async def create_assertion(
data: AssertionCreate,
tenant_id: Optional[str] = Query(None),
db: Session = Depends(get_db),
):
"""Create a single assertion manually."""
a = AssertionDB(
id=generate_id(),
tenant_id=tenant_id,
entity_type=data.entity_type,
entity_id=data.entity_id,
sentence_text=data.sentence_text,
assertion_type=data.assertion_type or "assertion",
evidence_ids=data.evidence_ids or [],
normative_tier=data.normative_tier,
)
db.add(a)
db.commit()
db.refresh(a)
return _build_assertion_response(a)
@router.get("/assertions", response_model=AssertionListResponse)
async def list_assertions(
entity_type: Optional[str] = Query(None),
entity_id: Optional[str] = Query(None),
assertion_type: Optional[str] = Query(None),
tenant_id: Optional[str] = Query(None),
limit: int = Query(100, ge=1, le=500),
db: Session = Depends(get_db),
):
"""List assertions with optional filters."""
query = db.query(AssertionDB)
if entity_type:
query = query.filter(AssertionDB.entity_type == entity_type)
if entity_id:
query = query.filter(AssertionDB.entity_id == entity_id)
if assertion_type:
query = query.filter(AssertionDB.assertion_type == assertion_type)
if tenant_id:
query = query.filter(AssertionDB.tenant_id == tenant_id)
total = query.count()
records = query.order_by(AssertionDB.sentence_index.asc()).limit(limit).all()
return AssertionListResponse(
assertions=[_build_assertion_response(a) for a in records],
total=total,
)
@router.get("/assertions/summary", response_model=AssertionSummaryResponse)
async def assertion_summary(
tenant_id: Optional[str] = Query(None),
entity_type: Optional[str] = Query(None),
entity_id: Optional[str] = Query(None),
db: Session = Depends(get_db),
):
"""Summary stats: total assertions, facts, rationale, unverified."""
query = db.query(AssertionDB)
if tenant_id:
query = query.filter(AssertionDB.tenant_id == tenant_id)
if entity_type:
query = query.filter(AssertionDB.entity_type == entity_type)
if entity_id:
query = query.filter(AssertionDB.entity_id == entity_id)
all_records = query.all()
total = len(all_records)
facts = sum(1 for a in all_records if a.assertion_type == "fact")
rationale = sum(1 for a in all_records if a.assertion_type == "rationale")
unverified = sum(1 for a in all_records if a.assertion_type == "assertion" and not a.verified_by)
return AssertionSummaryResponse(
total_assertions=total,
total_facts=facts,
total_rationale=rationale,
unverified_count=unverified,
)
@router.get("/assertions/{assertion_id}", response_model=AssertionResponse)
async def get_assertion(
assertion_id: str,
db: Session = Depends(get_db),
):
"""Get a single assertion by ID."""
a = db.query(AssertionDB).filter(AssertionDB.id == assertion_id).first()
if not a:
raise HTTPException(status_code=404, detail=f"Assertion {assertion_id} not found")
return _build_assertion_response(a)
@router.put("/assertions/{assertion_id}", response_model=AssertionResponse)
async def update_assertion(
assertion_id: str,
data: AssertionUpdate,
db: Session = Depends(get_db),
):
"""Update an assertion (e.g. link evidence, change type)."""
a = db.query(AssertionDB).filter(AssertionDB.id == assertion_id).first()
if not a:
raise HTTPException(status_code=404, detail=f"Assertion {assertion_id} not found")
update_fields = data.model_dump(exclude_unset=True)
for key, value in update_fields.items():
setattr(a, key, value)
a.updated_at = datetime.utcnow()
db.commit()
db.refresh(a)
return _build_assertion_response(a)
@router.post("/assertions/{assertion_id}/verify", response_model=AssertionResponse)
async def verify_assertion(
assertion_id: str,
verified_by: str = Query(...),
db: Session = Depends(get_db),
):
"""Mark an assertion as verified fact."""
a = db.query(AssertionDB).filter(AssertionDB.id == assertion_id).first()
if not a:
raise HTTPException(status_code=404, detail=f"Assertion {assertion_id} not found")
a.assertion_type = "fact"
a.verified_by = verified_by
a.verified_at = datetime.utcnow()
a.updated_at = datetime.utcnow()
db.commit()
db.refresh(a)
return _build_assertion_response(a)
@router.post("/assertions/extract", response_model=AssertionListResponse)
async def extract_assertions_endpoint(
data: AssertionExtractRequest,
tenant_id: Optional[str] = Query(None),
db: Session = Depends(get_db),
):
"""Extract assertions from free text and persist them."""
extracted = extract_assertions(
text=data.text,
entity_type=data.entity_type,
entity_id=data.entity_id,
tenant_id=tenant_id,
)
created = []
for item in extracted:
a = AssertionDB(
id=generate_id(),
tenant_id=item["tenant_id"],
entity_type=item["entity_type"],
entity_id=item["entity_id"],
sentence_text=item["sentence_text"],
sentence_index=item["sentence_index"],
assertion_type=item["assertion_type"],
evidence_ids=item["evidence_ids"],
normative_tier=item.get("normative_tier"),
confidence=item.get("confidence", 0.0),
)
db.add(a)
created.append(a)
db.commit()
for a in created:
db.refresh(a)
return AssertionListResponse(
assertions=[_build_assertion_response(a) for a in created],
total=len(created),
)

View File

@@ -0,0 +1,53 @@
"""Shared audit trail utilities.
Extracted from isms_routes.py for reuse across evidence, control,
and assertion routes.
"""
import hashlib
import uuid
from datetime import datetime
from sqlalchemy.orm import Session
from ..db.models import AuditTrailDB
def generate_id() -> str:
"""Generate a UUID string."""
return str(uuid.uuid4())
def create_signature(data: str) -> str:
"""Create SHA-256 signature."""
return hashlib.sha256(data.encode()).hexdigest()
def log_audit_trail(
db: Session,
entity_type: str,
entity_id: str,
entity_name: str,
action: str,
performed_by: str,
field_changed: str = None,
old_value: str = None,
new_value: str = None,
change_summary: str = None,
):
"""Log an entry to the audit trail."""
trail = AuditTrailDB(
id=generate_id(),
entity_type=entity_type,
entity_id=entity_id,
entity_name=entity_name,
action=action,
field_changed=field_changed,
old_value=old_value,
new_value=new_value,
change_summary=change_summary,
performed_by=performed_by,
performed_at=datetime.utcnow(),
checksum=create_signature(f"{entity_type}|{entity_id}|{action}|{performed_by}"),
)
db.add(trail)

File diff suppressed because it is too large Load Diff

View File

@@ -12,6 +12,7 @@ Endpoints:
POST /v1/canonical/blocked-sources/cleanup — Start cleanup workflow
"""
import asyncio
import json
import logging
from typing import Optional, List
@@ -25,7 +26,16 @@ from compliance.services.control_generator import (
ControlGeneratorPipeline,
GeneratorConfig,
ALL_COLLECTIONS,
VALID_CATEGORIES,
VALID_DOMAINS,
_classify_regulation,
_detect_category,
_detect_domain,
_llm_local,
_parse_llm_json,
CATEGORY_LIST_STR,
)
from compliance.services.citation_backfill import CitationBackfill, BackfillResult
from compliance.services.rag_client import get_rag_client
logger = logging.getLogger(__name__)
@@ -40,9 +50,12 @@ class GenerateRequest(BaseModel):
domain: Optional[str] = None
collections: Optional[List[str]] = None
max_controls: int = 50
max_chunks: int = 1000 # Default: process max 1000 chunks per job (respects document boundaries)
batch_size: int = 5
skip_web_search: bool = False
dry_run: bool = False
regulation_filter: Optional[List[str]] = None # Only process these regulation_code prefixes
skip_prefilter: bool = False # Skip local LLM pre-filter, send all chunks to API
class GenerateResponse(BaseModel):
@@ -55,6 +68,7 @@ class GenerateResponse(BaseModel):
controls_needs_review: int = 0
controls_too_close: int = 0
controls_duplicates_found: int = 0
controls_qa_fixed: int = 0
errors: list = []
controls: list = []
@@ -89,42 +103,111 @@ class BlockedSourceResponse(BaseModel):
# ENDPOINTS
# =============================================================================
async def _run_pipeline_background(config: GeneratorConfig, job_id: str):
"""Run the pipeline in the background. Uses its own DB session."""
db = SessionLocal()
try:
config.existing_job_id = job_id
pipeline = ControlGeneratorPipeline(db=db, rag_client=get_rag_client())
result = await pipeline.run(config)
logger.info(
"Background generation job %s completed: %d controls from %d chunks",
job_id, result.controls_generated, result.total_chunks_scanned,
)
except Exception as e:
logger.error("Background generation job %s failed: %s", job_id, e)
# Update job as failed
try:
db.execute(
text("""
UPDATE canonical_generation_jobs
SET status = 'failed', errors = :errors, completed_at = NOW()
WHERE id = CAST(:job_id AS uuid)
"""),
{"job_id": job_id, "errors": json.dumps([str(e)])},
)
db.commit()
except Exception:
pass
finally:
db.close()
@router.post("/generate", response_model=GenerateResponse)
async def start_generation(req: GenerateRequest):
"""Start a control generation run."""
"""Start a control generation run (runs in background).
Returns immediately with job_id. Use GET /generate/status/{job_id} to poll progress.
"""
config = GeneratorConfig(
collections=req.collections,
domain=req.domain,
batch_size=req.batch_size,
max_controls=req.max_controls,
max_chunks=req.max_chunks,
skip_web_search=req.skip_web_search,
dry_run=req.dry_run,
regulation_filter=req.regulation_filter,
skip_prefilter=req.skip_prefilter,
)
if req.dry_run:
# Dry run: execute synchronously and return controls
db = SessionLocal()
try:
pipeline = ControlGeneratorPipeline(db=db, rag_client=get_rag_client())
result = await pipeline.run(config)
return GenerateResponse(
job_id=result.job_id,
status=result.status,
message=f"Dry run: {result.controls_generated} controls from {result.total_chunks_scanned} chunks",
total_chunks_scanned=result.total_chunks_scanned,
controls_generated=result.controls_generated,
controls_verified=result.controls_verified,
controls_needs_review=result.controls_needs_review,
controls_too_close=result.controls_too_close,
controls_duplicates_found=result.controls_duplicates_found,
errors=result.errors,
controls=result.controls,
)
except Exception as e:
logger.error("Dry run failed: %s", e)
raise HTTPException(status_code=500, detail=str(e))
finally:
db.close()
# Create job record first so we can return the ID
db = SessionLocal()
try:
pipeline = ControlGeneratorPipeline(db=db, rag_client=get_rag_client())
result = await pipeline.run(config)
return GenerateResponse(
job_id=result.job_id,
status=result.status,
message=f"Generated {result.controls_generated} controls from {result.total_chunks_scanned} chunks",
total_chunks_scanned=result.total_chunks_scanned,
controls_generated=result.controls_generated,
controls_verified=result.controls_verified,
controls_needs_review=result.controls_needs_review,
controls_too_close=result.controls_too_close,
controls_duplicates_found=result.controls_duplicates_found,
errors=result.errors,
controls=result.controls if req.dry_run else [],
result = db.execute(
text("""
INSERT INTO canonical_generation_jobs (status, config)
VALUES ('running', :config)
RETURNING id
"""),
{"config": json.dumps(config.model_dump())},
)
db.commit()
row = result.fetchone()
job_id = str(row[0]) if row else None
except Exception as e:
logger.error("Generation failed: %s", e)
raise HTTPException(status_code=500, detail=str(e))
logger.error("Failed to create job: %s", e)
raise HTTPException(status_code=500, detail=f"Failed to create job: {e}")
finally:
db.close()
if not job_id:
raise HTTPException(status_code=500, detail="Failed to create job record")
# Launch pipeline in background
asyncio.create_task(_run_pipeline_background(config, job_id))
return GenerateResponse(
job_id=job_id,
status="running",
message="Generation started in background. Poll /generate/status/{job_id} for progress.",
)
@router.get("/generate/status/{job_id}")
async def get_job_status(job_id: str):
@@ -132,7 +215,7 @@ async def get_job_status(job_id: str):
db = SessionLocal()
try:
result = db.execute(
text("SELECT * FROM canonical_generation_jobs WHERE id = :id::uuid"),
text("SELECT * FROM canonical_generation_jobs WHERE id = CAST(:id AS uuid)"),
{"id": job_id},
)
row = result.fetchone()
@@ -270,6 +353,188 @@ async def review_control(control_id: str, req: ReviewRequest):
db.close()
class BulkReviewRequest(BaseModel):
release_state: str # Filter: which controls to bulk-review
action: str # "approve" or "reject"
new_state: Optional[str] = None # Override target state
@router.post("/generate/bulk-review")
async def bulk_review(req: BulkReviewRequest):
"""Bulk review all controls matching a release_state filter.
Example: reject all needs_review → sets them to deprecated.
"""
if req.release_state not in ("needs_review", "too_close", "duplicate"):
raise HTTPException(status_code=400, detail=f"Invalid filter state: {req.release_state}")
if req.action == "approve":
target = req.new_state or "draft"
elif req.action == "reject":
target = "deprecated"
else:
raise HTTPException(status_code=400, detail=f"Unknown action: {req.action}")
if target not in ("draft", "review", "approved", "deprecated", "needs_review"):
raise HTTPException(status_code=400, detail=f"Invalid target state: {target}")
db = SessionLocal()
try:
result = db.execute(
text("""
UPDATE canonical_controls
SET release_state = :target, updated_at = NOW()
WHERE release_state = :source
RETURNING control_id
"""),
{"source": req.release_state, "target": target},
)
affected = [row[0] for row in result]
db.commit()
return {
"action": req.action,
"source_state": req.release_state,
"target_state": target,
"affected_count": len(affected),
}
finally:
db.close()
class QAReclassifyRequest(BaseModel):
limit: int = 100 # How many controls to reclassify per run
dry_run: bool = True # Preview only by default
filter_category: Optional[str] = None # Only reclassify controls of this category
filter_domain_prefix: Optional[str] = None # Only reclassify controls with this prefix
@router.post("/generate/qa-reclassify")
async def qa_reclassify(req: QAReclassifyRequest):
"""Run QA reclassification on existing controls using local LLM.
Finds controls where keyword-detection disagrees with current category/domain,
then uses Ollama to determine the correct classification.
"""
db = SessionLocal()
try:
# Load controls to check
where_clauses = ["release_state NOT IN ('deprecated')"]
params = {"limit": req.limit}
if req.filter_category:
where_clauses.append("category = :cat")
params["cat"] = req.filter_category
if req.filter_domain_prefix:
where_clauses.append("control_id LIKE :prefix")
params["prefix"] = f"{req.filter_domain_prefix}-%"
where_sql = " AND ".join(where_clauses)
rows = db.execute(
text(f"""
SELECT id, control_id, title, objective, category,
COALESCE(requirements::text, '[]') as requirements,
COALESCE(source_original_text, '') as source_text
FROM canonical_controls
WHERE {where_sql}
ORDER BY created_at DESC
LIMIT :limit
"""),
params,
).fetchall()
results = {"checked": 0, "mismatches": 0, "fixes": [], "errors": []}
for row in rows:
results["checked"] += 1
control_id = row[1]
title = row[2]
objective = row[3] or ""
current_category = row[4]
source_text = row[6] or objective
# Keyword detection on source text
kw_category = _detect_category(source_text) or _detect_category(objective)
kw_domain = _detect_domain(source_text)
current_prefix = control_id.split("-")[0] if "-" in control_id else ""
# Skip if keyword detection agrees with current classification
if kw_category == current_category and kw_domain == current_prefix:
continue
results["mismatches"] += 1
# Ask Ollama to arbitrate
try:
reqs_text = ""
try:
reqs = json.loads(row[5])
if isinstance(reqs, list):
reqs_text = ", ".join(str(r) for r in reqs[:3])
except Exception:
pass
prompt = f"""Pruefe dieses Compliance-Control auf korrekte Klassifizierung.
Titel: {title[:100]}
Ziel: {objective[:200]}
Anforderungen: {reqs_text[:200]}
Aktuelle Zuordnung: domain={current_prefix}, category={current_category}
Keyword-Erkennung: domain={kw_domain}, category={kw_category}
Welche Zuordnung ist korrekt? Antworte NUR als JSON:
{{"domain": "KUERZEL", "category": "kategorie_name", "reason": "kurze Begruendung"}}
Domains: AUTH=Authentifizierung, CRYP=Kryptographie, NET=Netzwerk, DATA=Datenschutz, LOG=Logging, ACC=Zugriffskontrolle, SEC=IT-Sicherheit, INC=Vorfallmanagement, AI=KI, COMP=Compliance, GOV=Behoerden, LAB=Arbeitsrecht, FIN=Finanzregulierung, TRD=Gewerbe, ENV=Umwelt, HLT=Gesundheit
Kategorien: {CATEGORY_LIST_STR}"""
raw = await _llm_local(prompt)
data = _parse_llm_json(raw)
if not data:
continue
qa_domain = data.get("domain", "").upper()
qa_category = data.get("category", "")
reason = data.get("reason", "")
fix_entry = {
"control_id": control_id,
"title": title[:80],
"old_category": current_category,
"old_domain": current_prefix,
"new_category": qa_category if qa_category in VALID_CATEGORIES else current_category,
"new_domain": qa_domain if qa_domain in VALID_DOMAINS else current_prefix,
"reason": reason,
}
category_changed = qa_category in VALID_CATEGORIES and qa_category != current_category
if category_changed and not req.dry_run:
db.execute(
text("""
UPDATE canonical_controls
SET category = :category, updated_at = NOW()
WHERE id = :id
"""),
{"id": row[0], "category": qa_category},
)
fix_entry["applied"] = True
else:
fix_entry["applied"] = False
results["fixes"].append(fix_entry)
except Exception as e:
results["errors"].append({"control_id": control_id, "error": str(e)})
if not req.dry_run:
db.commit()
return results
finally:
db.close()
@router.get("/generate/processed-stats")
async def get_processed_stats():
"""Get processing statistics per collection."""
@@ -429,3 +694,407 @@ async def get_controls_customer_view(
return {"controls": controls, "total": len(controls)}
finally:
db.close()
# =============================================================================
# CITATION BACKFILL
# =============================================================================
class BackfillRequest(BaseModel):
dry_run: bool = True # Default to dry_run for safety
limit: int = 0 # 0 = all controls
class BackfillResponse(BaseModel):
status: str
total_controls: int = 0
matched_hash: int = 0
matched_regex: int = 0
matched_llm: int = 0
unmatched: int = 0
updated: int = 0
errors: list = []
_backfill_status: dict = {}
async def _run_backfill_background(dry_run: bool, limit: int, backfill_id: str):
"""Run backfill in background with own DB session."""
db = SessionLocal()
try:
backfill = CitationBackfill(db=db, rag_client=get_rag_client())
result = await backfill.run(dry_run=dry_run, limit=limit)
_backfill_status[backfill_id] = {
"status": "completed",
"total_controls": result.total_controls,
"matched_hash": result.matched_hash,
"matched_regex": result.matched_regex,
"matched_llm": result.matched_llm,
"unmatched": result.unmatched,
"updated": result.updated,
"errors": result.errors[:50],
}
logger.info("Backfill %s completed: %d updated", backfill_id, result.updated)
except Exception as e:
logger.error("Backfill %s failed: %s", backfill_id, e)
_backfill_status[backfill_id] = {"status": "failed", "errors": [str(e)]}
finally:
db.close()
@router.post("/generate/backfill-citations", response_model=BackfillResponse)
async def start_backfill(req: BackfillRequest):
"""Backfill article/paragraph into existing control source_citations.
Uses 3-tier matching: hash lookup → regex parse → Ollama LLM.
Default is dry_run=True (preview only, no DB changes).
"""
import uuid
backfill_id = str(uuid.uuid4())[:8]
_backfill_status[backfill_id] = {"status": "running"}
# Always run in background (RAG index build takes minutes)
asyncio.create_task(_run_backfill_background(req.dry_run, req.limit, backfill_id))
return BackfillResponse(
status=f"running (id={backfill_id})",
)
@router.get("/generate/backfill-status/{backfill_id}")
async def get_backfill_status(backfill_id: str):
"""Get status of a backfill job."""
status = _backfill_status.get(backfill_id)
if not status:
raise HTTPException(status_code=404, detail="Backfill job not found")
return status
# =============================================================================
# DOMAIN + TARGET AUDIENCE BACKFILL
# =============================================================================
class DomainBackfillRequest(BaseModel):
dry_run: bool = True
job_id: Optional[str] = None # Only backfill controls from this job
limit: int = 0 # 0 = all
_domain_backfill_status: dict = {}
async def _run_domain_backfill(req: DomainBackfillRequest, backfill_id: str):
"""Backfill domain, category, and target_audience for existing controls using Anthropic."""
import os
import httpx
ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY", "")
ANTHROPIC_MODEL = os.getenv("CONTROL_GEN_ANTHROPIC_MODEL", "claude-sonnet-4-6")
if not ANTHROPIC_API_KEY:
_domain_backfill_status[backfill_id] = {
"status": "failed", "error": "ANTHROPIC_API_KEY not set"
}
return
db = SessionLocal()
try:
# Find controls needing backfill
where_clauses = ["(target_audience IS NULL OR target_audience = '[]' OR target_audience = 'null')"]
params: dict = {}
if req.job_id:
where_clauses.append("generation_metadata->>'job_id' = :job_id")
params["job_id"] = req.job_id
query = f"""
SELECT id, control_id, title, objective, category, source_original_text, tags
FROM canonical_controls
WHERE {' AND '.join(where_clauses)}
ORDER BY control_id
"""
if req.limit > 0:
query += f" LIMIT {req.limit}"
result = db.execute(text(query), params)
controls = [dict(zip(result.keys(), row)) for row in result]
total = len(controls)
updated = 0
errors = []
_domain_backfill_status[backfill_id] = {
"status": "running", "total": total, "updated": 0, "errors": []
}
# Process in batches of 10
BATCH_SIZE = 10
for batch_start in range(0, total, BATCH_SIZE):
batch = controls[batch_start:batch_start + BATCH_SIZE]
entries = []
for idx, ctrl in enumerate(batch):
text_for_analysis = ctrl.get("objective") or ctrl.get("title") or ""
original = ctrl.get("source_original_text") or ""
if original:
text_for_analysis += f"\n\nQuelltext-Auszug: {original[:500]}"
entries.append(
f"--- CONTROL {idx + 1}: {ctrl['control_id']} ---\n"
f"Titel: {ctrl.get('title', '')}\n"
f"Objective: {text_for_analysis[:800]}\n"
f"Tags: {json.dumps(ctrl.get('tags', []))}"
)
prompt = f"""Analysiere die folgenden {len(batch)} Controls und bestimme fuer jedes:
1. domain: Das Fachgebiet (AUTH, CRYP, NET, DATA, LOG, ACC, SEC, INC, AI, COMP, GOV, LAB, FIN, TRD, ENV, HLT)
2. category: Die Kategorie (encryption, authentication, network, data_protection, logging, incident, continuity, compliance, supply_chain, physical, personnel, application, system, risk, governance, hardware, identity, public_administration, labor_law, finance, trade_regulation, environmental, health)
3. target_audience: Liste der Zielgruppen (moegliche Werte: "unternehmen", "behoerden", "entwickler", "datenschutzbeauftragte", "geschaeftsfuehrung", "it-abteilung", "rechtsabteilung", "compliance-officer", "personalwesen", "einkauf", "produktion", "vertrieb", "gesundheitswesen", "finanzwesen", "oeffentlicher_dienst")
Antworte mit einem JSON-Array mit {len(batch)} Objekten. Jedes Objekt hat:
- control_index: 1-basierter Index
- domain: Fachgebiet-Kuerzel
- category: Kategorie
- target_audience: Liste der Zielgruppen
{"".join(entries)}"""
try:
headers = {
"x-api-key": ANTHROPIC_API_KEY,
"anthropic-version": "2023-06-01",
"content-type": "application/json",
}
payload = {
"model": ANTHROPIC_MODEL,
"max_tokens": 4096,
"system": "Du bist ein Compliance-Experte. Klassifiziere Controls nach Fachgebiet und Zielgruppe. Antworte NUR mit validem JSON.",
"messages": [{"role": "user", "content": prompt}],
}
async with httpx.AsyncClient(timeout=60.0) as client:
resp = await client.post(
"https://api.anthropic.com/v1/messages",
headers=headers,
json=payload,
)
if resp.status_code != 200:
errors.append(f"Anthropic API {resp.status_code} at batch {batch_start}")
continue
raw = resp.json().get("content", [{}])[0].get("text", "")
# Parse response
import re
bracket_match = re.search(r"\[.*\]", raw, re.DOTALL)
if not bracket_match:
errors.append(f"No JSON array in response at batch {batch_start}")
continue
results_list = json.loads(bracket_match.group(0))
for item in results_list:
idx = item.get("control_index", 0) - 1
if idx < 0 or idx >= len(batch):
continue
ctrl = batch[idx]
ctrl_id = str(ctrl["id"])
new_domain = item.get("domain", "")
new_category = item.get("category", "")
new_audience = item.get("target_audience", [])
if not isinstance(new_audience, list):
new_audience = []
# Build new control_id from domain if domain changed
old_prefix = ctrl["control_id"].split("-")[0] if ctrl["control_id"] else ""
new_prefix = new_domain.upper()[:4] if new_domain else old_prefix
if not req.dry_run:
update_parts = []
update_params: dict = {"ctrl_id": ctrl_id}
if new_category:
update_parts.append("category = :category")
update_params["category"] = new_category
if new_audience:
update_parts.append("target_audience = :target_audience")
update_params["target_audience"] = json.dumps(new_audience)
# Note: We do NOT rename control_ids here — that would
# break references and cause unique constraint violations.
if update_parts:
update_parts.append("updated_at = NOW()")
db.execute(
text(f"UPDATE canonical_controls SET {', '.join(update_parts)} WHERE id = CAST(:ctrl_id AS uuid)"),
update_params,
)
updated += 1
if not req.dry_run:
db.commit()
except Exception as e:
errors.append(f"Batch {batch_start}: {str(e)}")
db.rollback()
_domain_backfill_status[backfill_id] = {
"status": "running", "total": total, "updated": updated,
"progress": f"{min(batch_start + BATCH_SIZE, total)}/{total}",
"errors": errors[-10:],
}
_domain_backfill_status[backfill_id] = {
"status": "completed", "total": total, "updated": updated,
"errors": errors[-50:],
}
logger.info("Domain backfill %s completed: %d/%d updated", backfill_id, updated, total)
except Exception as e:
logger.error("Domain backfill %s failed: %s", backfill_id, e)
_domain_backfill_status[backfill_id] = {"status": "failed", "error": str(e)}
finally:
db.close()
@router.post("/generate/backfill-domain")
async def start_domain_backfill(req: DomainBackfillRequest):
"""Backfill domain, category, and target_audience for controls using Anthropic API.
Finds controls where target_audience is NULL and enriches them.
Default is dry_run=True (preview only).
"""
import uuid
backfill_id = str(uuid.uuid4())[:8]
_domain_backfill_status[backfill_id] = {"status": "starting"}
asyncio.create_task(_run_domain_backfill(req, backfill_id))
return {"status": "running", "backfill_id": backfill_id,
"message": f"Domain backfill started. Poll /generate/backfill-status/{backfill_id}"}
@router.get("/generate/domain-backfill-status/{backfill_id}")
async def get_domain_backfill_status(backfill_id: str):
"""Get status of a domain backfill job."""
status = _domain_backfill_status.get(backfill_id)
if not status:
raise HTTPException(status_code=404, detail="Domain backfill job not found")
return status
# ---------------------------------------------------------------------------
# Source-Type Backfill — Classify law vs guideline vs standard vs restricted
# ---------------------------------------------------------------------------
class SourceTypeBackfillRequest(BaseModel):
dry_run: bool = True
_source_type_backfill_status: dict = {}
async def _run_source_type_backfill(dry_run: bool, backfill_id: str):
"""Backfill source_type into source_citation JSONB for all controls."""
db = SessionLocal()
try:
# Find controls with source_citation that lack source_type
rows = db.execute(text("""
SELECT control_id, source_citation, generation_metadata
FROM compliance.canonical_controls
WHERE source_citation IS NOT NULL
AND (source_citation->>'source_type' IS NULL
OR source_citation->>'source_type' = '')
""")).fetchall()
total = len(rows)
updated = 0
already_correct = 0
errors = []
_source_type_backfill_status[backfill_id] = {
"status": "running", "total": total, "updated": 0, "dry_run": dry_run,
}
for row in rows:
cid = row[0]
citation = row[1] if isinstance(row[1], dict) else json.loads(row[1] or "{}")
metadata = row[2] if isinstance(row[2], dict) else json.loads(row[2] or "{}")
# Get regulation_code from metadata
reg_code = metadata.get("source_regulation", "")
if not reg_code:
# Try to infer from source name
errors.append(f"{cid}: no source_regulation in metadata")
continue
# Classify
license_info = _classify_regulation(reg_code)
source_type = license_info.get("source_type", "restricted")
# Update citation
citation["source_type"] = source_type
if not dry_run:
db.execute(text("""
UPDATE compliance.canonical_controls
SET source_citation = :citation
WHERE control_id = :cid
"""), {"citation": json.dumps(citation), "cid": cid})
if updated % 100 == 0:
db.commit()
updated += 1
if not dry_run:
db.commit()
# Count distribution
dist_query = db.execute(text("""
SELECT source_citation->>'source_type' as st, COUNT(*)
FROM compliance.canonical_controls
WHERE source_citation IS NOT NULL
AND source_citation->>'source_type' IS NOT NULL
GROUP BY st
""")).fetchall() if not dry_run else []
distribution = {r[0]: r[1] for r in dist_query}
_source_type_backfill_status[backfill_id] = {
"status": "completed", "total": total, "updated": updated,
"dry_run": dry_run, "distribution": distribution,
"errors": errors[:50],
}
logger.info("Source-type backfill %s completed: %d/%d updated (dry_run=%s)",
backfill_id, updated, total, dry_run)
except Exception as e:
logger.error("Source-type backfill %s failed: %s", backfill_id, e)
_source_type_backfill_status[backfill_id] = {"status": "failed", "error": str(e)}
finally:
db.close()
@router.post("/generate/backfill-source-type")
async def start_source_type_backfill(req: SourceTypeBackfillRequest):
"""Backfill source_type (law/guideline/standard/restricted) into source_citation JSONB.
Classifies each control's source as binding law, authority guideline,
voluntary standard, or restricted norm based on regulation_code.
Default is dry_run=True (preview only).
"""
import uuid
backfill_id = str(uuid.uuid4())[:8]
_source_type_backfill_status[backfill_id] = {"status": "starting"}
asyncio.create_task(_run_source_type_backfill(req.dry_run, backfill_id))
return {
"status": "running",
"backfill_id": backfill_id,
"message": f"Source-type backfill started. Poll /generate/source-type-backfill-status/{backfill_id}",
}
@router.get("/generate/source-type-backfill-status/{backfill_id}")
async def get_source_type_backfill_status(backfill_id: str):
"""Get status of a source-type backfill job."""
status = _source_type_backfill_status.get(backfill_id)
if not status:
raise HTTPException(status_code=404, detail="Source-type backfill job not found")
return status

View File

@@ -0,0 +1,856 @@
"""
FastAPI routes for the Multi-Layer Control Architecture.
Pattern Library, Obligation Extraction, Crosswalk Matrix, and Migration endpoints.
Endpoints:
GET /v1/canonical/patterns — All patterns (with filters)
GET /v1/canonical/patterns/{pattern_id} — Single pattern
GET /v1/canonical/patterns/{pattern_id}/controls — Controls for a pattern
POST /v1/canonical/obligations/extract — Extract obligations from text
GET /v1/canonical/crosswalk — Query crosswalk matrix
GET /v1/canonical/crosswalk/stats — Coverage statistics
POST /v1/canonical/migrate/decompose — Pass 0a: Obligation extraction
POST /v1/canonical/migrate/merge-obligations — Merge implementation-level dupes
POST /v1/canonical/migrate/enrich-obligations — Add trigger_type, impl metadata
POST /v1/canonical/migrate/compose-atomic — Pass 0b: Atomic control composition
POST /v1/canonical/migrate/link-obligations — Pass 1: Obligation linkage
POST /v1/canonical/migrate/classify-patterns — Pass 2: Pattern classification
POST /v1/canonical/migrate/triage — Pass 3: Quality triage
POST /v1/canonical/migrate/backfill-crosswalk — Pass 4: Crosswalk backfill
POST /v1/canonical/migrate/deduplicate — Pass 5: Deduplication
GET /v1/canonical/migrate/status — Migration progress
GET /v1/canonical/migrate/decomposition-status — Decomposition progress
"""
import json
import logging
from typing import Optional, List
from fastapi import APIRouter, HTTPException, Query
from pydantic import BaseModel
from sqlalchemy import text
from database import SessionLocal
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/v1/canonical", tags=["crosswalk"])
# =============================================================================
# REQUEST / RESPONSE MODELS
# =============================================================================
class PatternResponse(BaseModel):
id: str
name: str
name_de: str
domain: str
category: str
description: str
objective_template: str
severity_default: str
implementation_effort_default: str = "m"
tags: list = []
composable_with: list = []
open_anchor_refs: list = []
controls_count: int = 0
class PatternListResponse(BaseModel):
patterns: List[PatternResponse]
total: int
class PatternDetailResponse(PatternResponse):
rationale_template: str = ""
requirements_template: list = []
test_procedure_template: list = []
evidence_template: list = []
obligation_match_keywords: list = []
class ObligationExtractRequest(BaseModel):
text: str
regulation_code: Optional[str] = None
article: Optional[str] = None
paragraph: Optional[str] = None
class ObligationExtractResponse(BaseModel):
obligation_id: Optional[str] = None
obligation_title: Optional[str] = None
obligation_text: Optional[str] = None
method: str = "none"
confidence: float = 0.0
regulation_id: Optional[str] = None
pattern_id: Optional[str] = None
pattern_confidence: float = 0.0
class CrosswalkRow(BaseModel):
regulation_code: str = ""
article: Optional[str] = None
obligation_id: Optional[str] = None
pattern_id: Optional[str] = None
master_control_id: Optional[str] = None
confidence: float = 0.0
source: str = "auto"
class CrosswalkQueryResponse(BaseModel):
rows: List[CrosswalkRow]
total: int
class CrosswalkStatsResponse(BaseModel):
total_rows: int = 0
regulations_covered: int = 0
obligations_linked: int = 0
patterns_used: int = 0
controls_linked: int = 0
coverage_by_regulation: dict = {}
class MigrationRequest(BaseModel):
limit: int = 0 # 0 = no limit
batch_size: int = 0 # 0 = auto (5 for Anthropic, 1 for Ollama)
use_anthropic: bool = False # Use Anthropic API instead of Ollama
category_filter: Optional[str] = None # Comma-separated categories
source_filter: Optional[str] = None # Comma-separated source regulations (ILIKE match)
class BatchSubmitRequest(BaseModel):
limit: int = 0
batch_size: int = 5
category_filter: Optional[str] = None
source_filter: Optional[str] = None
class BatchProcessRequest(BaseModel):
batch_id: str
pass_type: str = "0a" # "0a" or "0b"
class MigrationResponse(BaseModel):
status: str = "completed"
stats: dict = {}
class MigrationStatusResponse(BaseModel):
total_controls: int = 0
has_obligation: int = 0
has_pattern: int = 0
fully_linked: int = 0
deprecated: int = 0
coverage_obligation_pct: float = 0.0
coverage_pattern_pct: float = 0.0
coverage_full_pct: float = 0.0
class DecompositionStatusResponse(BaseModel):
rich_controls: int = 0
decomposed_controls: int = 0
total_candidates: int = 0
validated: int = 0
rejected: int = 0
composed: int = 0
atomic_controls: int = 0
merged: int = 0
enriched: int = 0
ready_for_pass0b: int = 0
decomposition_pct: float = 0.0
composition_pct: float = 0.0
# =============================================================================
# PATTERN LIBRARY ENDPOINTS
# =============================================================================
@router.get("/patterns", response_model=PatternListResponse)
async def list_patterns(
domain: Optional[str] = Query(None, description="Filter by domain (e.g. AUTH, CRYP)"),
category: Optional[str] = Query(None, description="Filter by category"),
tag: Optional[str] = Query(None, description="Filter by tag"),
):
"""List all control patterns with optional filters."""
from compliance.services.pattern_matcher import PatternMatcher
matcher = PatternMatcher()
matcher._load_patterns()
matcher._build_keyword_index()
patterns = matcher._patterns
if domain:
patterns = [p for p in patterns if p.domain == domain.upper()]
if category:
patterns = [p for p in patterns if p.category == category.lower()]
if tag:
patterns = [p for p in patterns if tag.lower() in [t.lower() for t in p.tags]]
# Count controls per pattern from DB
control_counts = _get_pattern_control_counts()
response_patterns = []
for p in patterns:
response_patterns.append(PatternResponse(
id=p.id,
name=p.name,
name_de=p.name_de,
domain=p.domain,
category=p.category,
description=p.description,
objective_template=p.objective_template,
severity_default=p.severity_default,
implementation_effort_default=p.implementation_effort_default,
tags=p.tags,
composable_with=p.composable_with,
open_anchor_refs=p.open_anchor_refs,
controls_count=control_counts.get(p.id, 0),
))
return PatternListResponse(patterns=response_patterns, total=len(response_patterns))
@router.get("/patterns/{pattern_id}", response_model=PatternDetailResponse)
async def get_pattern(pattern_id: str):
"""Get a single control pattern by ID."""
from compliance.services.pattern_matcher import PatternMatcher
matcher = PatternMatcher()
matcher._load_patterns()
pattern = matcher.get_pattern(pattern_id)
if not pattern:
raise HTTPException(status_code=404, detail=f"Pattern {pattern_id} not found")
control_counts = _get_pattern_control_counts()
return PatternDetailResponse(
id=pattern.id,
name=pattern.name,
name_de=pattern.name_de,
domain=pattern.domain,
category=pattern.category,
description=pattern.description,
objective_template=pattern.objective_template,
rationale_template=pattern.rationale_template,
requirements_template=pattern.requirements_template,
test_procedure_template=pattern.test_procedure_template,
evidence_template=pattern.evidence_template,
severity_default=pattern.severity_default,
implementation_effort_default=pattern.implementation_effort_default,
tags=pattern.tags,
composable_with=pattern.composable_with,
open_anchor_refs=pattern.open_anchor_refs,
obligation_match_keywords=pattern.obligation_match_keywords,
controls_count=control_counts.get(pattern.id, 0),
)
@router.get("/patterns/{pattern_id}/controls")
async def get_pattern_controls(
pattern_id: str,
limit: int = Query(50, ge=1, le=500),
offset: int = Query(0, ge=0),
):
"""Get controls generated from a specific pattern."""
db = SessionLocal()
try:
result = db.execute(
text("""
SELECT id, control_id, title, objective, severity,
release_state, category, obligation_ids
FROM canonical_controls
WHERE pattern_id = :pattern_id
AND release_state NOT IN ('deprecated')
ORDER BY control_id
LIMIT :limit OFFSET :offset
"""),
{"pattern_id": pattern_id.upper(), "limit": limit, "offset": offset},
)
rows = result.fetchall()
count_result = db.execute(
text("""
SELECT count(*) FROM canonical_controls
WHERE pattern_id = :pattern_id
AND release_state NOT IN ('deprecated')
"""),
{"pattern_id": pattern_id.upper()},
)
total = count_result.fetchone()[0]
controls = []
for row in rows:
obl_ids = row[7]
if isinstance(obl_ids, str):
try:
obl_ids = json.loads(obl_ids)
except (json.JSONDecodeError, TypeError):
obl_ids = []
controls.append({
"id": str(row[0]),
"control_id": row[1],
"title": row[2],
"objective": row[3],
"severity": row[4],
"release_state": row[5],
"category": row[6],
"obligation_ids": obl_ids or [],
})
return {"controls": controls, "total": total}
finally:
db.close()
# =============================================================================
# OBLIGATION EXTRACTION ENDPOINT
# =============================================================================
@router.post("/obligations/extract", response_model=ObligationExtractResponse)
async def extract_obligation(req: ObligationExtractRequest):
"""Extract obligation from text using 3-tier strategy, then match to pattern."""
from compliance.services.obligation_extractor import ObligationExtractor
from compliance.services.pattern_matcher import PatternMatcher
extractor = ObligationExtractor()
await extractor.initialize()
obligation = await extractor.extract(
chunk_text=req.text,
regulation_code=req.regulation_code or "",
article=req.article,
paragraph=req.paragraph,
)
# Also match to pattern
matcher = PatternMatcher()
matcher._load_patterns()
matcher._build_keyword_index()
pattern_text = obligation.obligation_text or obligation.obligation_title or req.text[:500]
pattern_result = matcher._tier1_keyword(pattern_text, obligation.regulation_id)
return ObligationExtractResponse(
obligation_id=obligation.obligation_id,
obligation_title=obligation.obligation_title,
obligation_text=obligation.obligation_text,
method=obligation.method,
confidence=obligation.confidence,
regulation_id=obligation.regulation_id,
pattern_id=pattern_result.pattern_id if pattern_result else None,
pattern_confidence=pattern_result.confidence if pattern_result else 0,
)
# =============================================================================
# CROSSWALK MATRIX ENDPOINTS
# =============================================================================
@router.get("/crosswalk", response_model=CrosswalkQueryResponse)
async def query_crosswalk(
regulation_code: Optional[str] = Query(None),
article: Optional[str] = Query(None),
obligation_id: Optional[str] = Query(None),
pattern_id: Optional[str] = Query(None),
limit: int = Query(100, ge=1, le=1000),
offset: int = Query(0, ge=0),
):
"""Query the crosswalk matrix with filters."""
db = SessionLocal()
try:
conditions = ["1=1"]
params = {"limit": limit, "offset": offset}
if regulation_code:
conditions.append("regulation_code = :reg")
params["reg"] = regulation_code
if article:
conditions.append("article = :art")
params["art"] = article
if obligation_id:
conditions.append("obligation_id = :obl")
params["obl"] = obligation_id
if pattern_id:
conditions.append("pattern_id = :pat")
params["pat"] = pattern_id
where = " AND ".join(conditions)
result = db.execute(
text(f"""
SELECT regulation_code, article, obligation_id,
pattern_id, master_control_id, confidence, source
FROM crosswalk_matrix
WHERE {where}
ORDER BY regulation_code, article
LIMIT :limit OFFSET :offset
"""),
params,
)
rows = result.fetchall()
count_result = db.execute(
text(f"SELECT count(*) FROM crosswalk_matrix WHERE {where}"),
params,
)
total = count_result.fetchone()[0]
crosswalk_rows = [
CrosswalkRow(
regulation_code=r[0] or "",
article=r[1],
obligation_id=r[2],
pattern_id=r[3],
master_control_id=r[4],
confidence=float(r[5] or 0),
source=r[6] or "auto",
)
for r in rows
]
return CrosswalkQueryResponse(rows=crosswalk_rows, total=total)
finally:
db.close()
@router.get("/crosswalk/stats", response_model=CrosswalkStatsResponse)
async def crosswalk_stats():
"""Get crosswalk coverage statistics."""
db = SessionLocal()
try:
row = db.execute(text("""
SELECT
count(*) AS total,
count(DISTINCT regulation_code) FILTER (WHERE regulation_code != '') AS regs,
count(DISTINCT obligation_id) FILTER (WHERE obligation_id IS NOT NULL) AS obls,
count(DISTINCT pattern_id) FILTER (WHERE pattern_id IS NOT NULL) AS pats,
count(DISTINCT master_control_id) FILTER (WHERE master_control_id IS NOT NULL) AS ctrls
FROM crosswalk_matrix
""")).fetchone()
# Coverage by regulation
reg_rows = db.execute(text("""
SELECT regulation_code, count(*) AS cnt
FROM crosswalk_matrix
WHERE regulation_code != ''
GROUP BY regulation_code
ORDER BY cnt DESC
""")).fetchall()
coverage = {r[0]: r[1] for r in reg_rows}
return CrosswalkStatsResponse(
total_rows=row[0],
regulations_covered=row[1],
obligations_linked=row[2],
patterns_used=row[3],
controls_linked=row[4],
coverage_by_regulation=coverage,
)
finally:
db.close()
# =============================================================================
# MIGRATION ENDPOINTS
# =============================================================================
@router.post("/migrate/decompose", response_model=MigrationResponse)
async def migrate_decompose(req: MigrationRequest):
"""Pass 0a: Extract obligation candidates from rich controls.
With use_anthropic=true, uses Anthropic API with prompt caching
and content batching (multiple controls per API call).
"""
from compliance.services.decomposition_pass import DecompositionPass
db = SessionLocal()
try:
decomp = DecompositionPass(db=db)
stats = await decomp.run_pass0a(
limit=req.limit,
batch_size=req.batch_size,
use_anthropic=req.use_anthropic,
category_filter=req.category_filter,
source_filter=req.source_filter,
)
return MigrationResponse(status="completed", stats=stats)
except Exception as e:
logger.error("Decomposition pass 0a failed: %s", e)
raise HTTPException(status_code=500, detail=str(e))
finally:
db.close()
@router.post("/migrate/merge-obligations", response_model=MigrationResponse)
async def migrate_merge_obligations():
"""Merge implementation-level duplicate obligations within each parent.
Run AFTER Pass 0a, BEFORE Pass 0b. No LLM calls — rule-based.
Merges obligations that share similar action+object into the more
abstract survivor, marking the concrete duplicate as 'merged'.
"""
from compliance.services.decomposition_pass import DecompositionPass
db = SessionLocal()
try:
decomp = DecompositionPass(db=db)
stats = decomp.run_merge_pass()
return MigrationResponse(status="completed", stats=stats)
except Exception as e:
logger.error("Merge pass failed: %s", e)
raise HTTPException(status_code=500, detail=str(e))
finally:
db.close()
@router.post("/migrate/enrich-obligations", response_model=MigrationResponse)
async def migrate_enrich_obligations():
"""Add trigger_type and is_implementation_specific metadata.
Run AFTER merge pass, BEFORE Pass 0b. No LLM calls — rule-based.
Classifies trigger_type (event/periodic/continuous) from obligation text
and detects implementation-specific obligations (concrete tools/protocols).
"""
from compliance.services.decomposition_pass import DecompositionPass
db = SessionLocal()
try:
decomp = DecompositionPass(db=db)
stats = decomp.enrich_obligations()
return MigrationResponse(status="completed", stats=stats)
except Exception as e:
logger.error("Enrich pass failed: %s", e)
raise HTTPException(status_code=500, detail=str(e))
finally:
db.close()
@router.post("/migrate/compose-atomic", response_model=MigrationResponse)
async def migrate_compose_atomic(req: MigrationRequest):
"""Pass 0b: Compose atomic controls from obligation candidates.
With use_anthropic=true, uses Anthropic API with prompt caching
and content batching (multiple obligations per API call).
"""
from compliance.services.decomposition_pass import DecompositionPass
db = SessionLocal()
try:
decomp = DecompositionPass(db=db)
stats = await decomp.run_pass0b(
limit=req.limit,
batch_size=req.batch_size,
use_anthropic=req.use_anthropic,
)
return MigrationResponse(status="completed", stats=stats)
except Exception as e:
logger.error("Decomposition pass 0b failed: %s", e)
raise HTTPException(status_code=500, detail=str(e))
finally:
db.close()
@router.post("/migrate/batch-submit-0a", response_model=MigrationResponse)
async def batch_submit_pass0a(req: BatchSubmitRequest):
"""Submit Pass 0a as Anthropic Batch API job (50% cost reduction).
Returns a batch_id for polling. Results are processed asynchronously
within 24 hours by Anthropic.
"""
from compliance.services.decomposition_pass import DecompositionPass
db = SessionLocal()
try:
decomp = DecompositionPass(db=db)
result = await decomp.submit_batch_pass0a(
limit=req.limit,
batch_size=req.batch_size,
category_filter=req.category_filter,
source_filter=req.source_filter,
)
return MigrationResponse(status=result.pop("status", "submitted"), stats=result)
except Exception as e:
logger.error("Batch submit 0a failed: %s", e)
raise HTTPException(status_code=500, detail=str(e))
finally:
db.close()
@router.post("/migrate/batch-submit-0b", response_model=MigrationResponse)
async def batch_submit_pass0b(req: BatchSubmitRequest):
"""Submit Pass 0b as Anthropic Batch API job (50% cost reduction)."""
from compliance.services.decomposition_pass import DecompositionPass
db = SessionLocal()
try:
decomp = DecompositionPass(db=db)
result = await decomp.submit_batch_pass0b(
limit=req.limit,
batch_size=req.batch_size,
)
return MigrationResponse(status=result.pop("status", "submitted"), stats=result)
except Exception as e:
logger.error("Batch submit 0b failed: %s", e)
raise HTTPException(status_code=500, detail=str(e))
finally:
db.close()
@router.get("/migrate/batch-status/{batch_id}")
async def batch_check_status(batch_id: str):
"""Check processing status of an Anthropic batch job."""
from compliance.services.decomposition_pass import check_batch_status
try:
status = await check_batch_status(batch_id)
return status
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
@router.post("/migrate/batch-process", response_model=MigrationResponse)
async def batch_process_results(req: BatchProcessRequest):
"""Fetch and process results from a completed Anthropic batch.
Call this after batch-status shows processing_status='ended'.
"""
from compliance.services.decomposition_pass import DecompositionPass
db = SessionLocal()
try:
decomp = DecompositionPass(db=db)
stats = await decomp.process_batch_results(
batch_id=req.batch_id,
pass_type=req.pass_type,
)
return MigrationResponse(status=stats.pop("status", "completed"), stats=stats)
except Exception as e:
logger.error("Batch process failed: %s", e)
raise HTTPException(status_code=500, detail=str(e))
finally:
db.close()
@router.post("/migrate/link-obligations", response_model=MigrationResponse)
async def migrate_link_obligations(req: MigrationRequest):
"""Pass 1: Link controls to obligations via source_citation article."""
from compliance.services.pipeline_adapter import MigrationPasses
db = SessionLocal()
try:
migration = MigrationPasses(db=db)
await migration.initialize()
stats = await migration.run_pass1_obligation_linkage(limit=req.limit)
return MigrationResponse(status="completed", stats=stats)
except Exception as e:
logger.error("Migration pass 1 failed: %s", e)
raise HTTPException(status_code=500, detail=str(e))
finally:
db.close()
@router.post("/migrate/classify-patterns", response_model=MigrationResponse)
async def migrate_classify_patterns(req: MigrationRequest):
"""Pass 2: Classify controls into patterns via keyword matching."""
from compliance.services.pipeline_adapter import MigrationPasses
db = SessionLocal()
try:
migration = MigrationPasses(db=db)
await migration.initialize()
stats = await migration.run_pass2_pattern_classification(limit=req.limit)
return MigrationResponse(status="completed", stats=stats)
except Exception as e:
logger.error("Migration pass 2 failed: %s", e)
raise HTTPException(status_code=500, detail=str(e))
finally:
db.close()
@router.post("/migrate/triage", response_model=MigrationResponse)
async def migrate_triage():
"""Pass 3: Quality triage — categorize by linkage completeness."""
from compliance.services.pipeline_adapter import MigrationPasses
db = SessionLocal()
try:
migration = MigrationPasses(db=db)
stats = migration.run_pass3_quality_triage()
return MigrationResponse(status="completed", stats=stats)
except Exception as e:
logger.error("Migration pass 3 failed: %s", e)
raise HTTPException(status_code=500, detail=str(e))
finally:
db.close()
@router.post("/migrate/backfill-crosswalk", response_model=MigrationResponse)
async def migrate_backfill_crosswalk():
"""Pass 4: Create crosswalk rows for linked controls."""
from compliance.services.pipeline_adapter import MigrationPasses
db = SessionLocal()
try:
migration = MigrationPasses(db=db)
stats = migration.run_pass4_crosswalk_backfill()
return MigrationResponse(status="completed", stats=stats)
except Exception as e:
logger.error("Migration pass 4 failed: %s", e)
raise HTTPException(status_code=500, detail=str(e))
finally:
db.close()
@router.post("/migrate/deduplicate", response_model=MigrationResponse)
async def migrate_deduplicate():
"""Pass 5: Mark duplicate controls (same obligation + pattern)."""
from compliance.services.pipeline_adapter import MigrationPasses
db = SessionLocal()
try:
migration = MigrationPasses(db=db)
stats = migration.run_pass5_deduplication()
return MigrationResponse(status="completed", stats=stats)
except Exception as e:
logger.error("Migration pass 5 failed: %s", e)
raise HTTPException(status_code=500, detail=str(e))
finally:
db.close()
@router.get("/migrate/status", response_model=MigrationStatusResponse)
async def migration_status():
"""Get overall migration progress."""
from compliance.services.pipeline_adapter import MigrationPasses
db = SessionLocal()
try:
migration = MigrationPasses(db=db)
status = migration.migration_status()
return MigrationStatusResponse(**status)
except Exception as e:
logger.error("Migration status failed: %s", e)
raise HTTPException(status_code=500, detail=str(e))
finally:
db.close()
@router.get("/migrate/decomposition-status", response_model=DecompositionStatusResponse)
async def decomposition_status():
"""Get decomposition progress (Pass 0a/0b)."""
from compliance.services.decomposition_pass import DecompositionPass
db = SessionLocal()
try:
decomp = DecompositionPass(db=db)
status = decomp.decomposition_status()
return DecompositionStatusResponse(**status)
except Exception as e:
logger.error("Decomposition status failed: %s", e)
raise HTTPException(status_code=500, detail=str(e))
finally:
db.close()
# =============================================================================
# BATCH DEDUP ENDPOINTS
# =============================================================================
# Module-level runner reference for status polling
_batch_dedup_runner = None
@router.post("/migrate/batch-dedup", response_model=MigrationResponse)
async def migrate_batch_dedup(
dry_run: bool = Query(False, description="Preview mode — no DB changes"),
hint_filter: Optional[str] = Query(None, description="Only process hints matching this prefix"),
):
"""Batch dedup: reduce ~85k Pass 0b controls to ~18-25k masters.
Phase 1: Groups by merge_group_hint, picks best quality master, links rest.
Phase 2: Cross-group embedding search for semantically similar masters.
"""
global _batch_dedup_runner
from compliance.services.batch_dedup_runner import BatchDedupRunner
db = SessionLocal()
try:
runner = BatchDedupRunner(db=db)
_batch_dedup_runner = runner
stats = await runner.run(dry_run=dry_run, hint_filter=hint_filter)
return MigrationResponse(status="completed", stats=stats)
except Exception as e:
logger.error("Batch dedup failed: %s", e)
raise HTTPException(status_code=500, detail=str(e))
finally:
_batch_dedup_runner = None
db.close()
@router.get("/migrate/batch-dedup/status")
async def batch_dedup_status():
"""Get current batch dedup progress (while running)."""
if _batch_dedup_runner is not None:
return {"running": True, **_batch_dedup_runner.get_status()}
# Not running — show DB stats
db = SessionLocal()
try:
row = db.execute(text("""
SELECT
count(*) FILTER (WHERE decomposition_method = 'pass0b') AS total_pass0b,
count(*) FILTER (WHERE decomposition_method = 'pass0b'
AND release_state = 'duplicate') AS duplicates,
count(*) FILTER (WHERE decomposition_method = 'pass0b'
AND release_state != 'duplicate'
AND release_state != 'deprecated') AS masters
FROM canonical_controls
""")).fetchone()
review_count = db.execute(text(
"SELECT count(*) FROM control_dedup_reviews WHERE review_status = 'pending'"
)).fetchone()[0]
return {
"running": False,
"total_pass0b": row[0],
"duplicates": row[1],
"masters": row[2],
"pending_reviews": review_count,
}
finally:
db.close()
# =============================================================================
# HELPERS
# =============================================================================
def _get_pattern_control_counts() -> dict[str, int]:
"""Get count of controls per pattern_id from DB."""
db = SessionLocal()
try:
result = db.execute(text("""
SELECT pattern_id, count(*) AS cnt
FROM canonical_controls
WHERE pattern_id IS NOT NULL AND pattern_id != ''
AND release_state NOT IN ('deprecated')
GROUP BY pattern_id
"""))
return {row[0]: row[1] for row in result.fetchall()}
except Exception:
return {}
finally:
db.close()

View File

@@ -5,16 +5,23 @@ Endpoints:
- /dashboard: Main compliance dashboard
- /dashboard/executive: Executive summary for managers
- /dashboard/trend: Compliance score trend over time
- /dashboard/roadmap: Prioritised controls in 4 buckets
- /dashboard/module-status: Completion status of each SDK module
- /dashboard/next-actions: Top 5 most important actions
- /dashboard/snapshot: Save / query compliance score snapshots
- /score: Quick compliance score
- /reports: Report generation
"""
import logging
from datetime import datetime, timedelta, timezone
from datetime import datetime, date, timedelta
from calendar import month_abbr
from typing import Optional
from typing import Optional, Dict, Any, List
from decimal import Decimal
from fastapi import APIRouter, Depends, HTTPException, Query
from pydantic import BaseModel
from sqlalchemy import text
from sqlalchemy.orm import Session
from classroom_engine.database import get_db
@@ -25,15 +32,24 @@ from ..db import (
ControlRepository,
EvidenceRepository,
RiskRepository,
AssertionDB,
)
from .schemas import (
DashboardResponse,
MultiDimensionalScore,
ExecutiveDashboardResponse,
TrendDataPoint,
RiskSummary,
DeadlineItem,
TeamWorkloadItem,
TraceabilityAssertion,
TraceabilityEvidence,
TraceabilityCoverage,
TraceabilityControl,
TraceabilityMatrixResponse,
)
from .tenant_utils import get_tenant_id as _get_tenant_id
from .db_utils import row_to_dict as _row_to_dict
logger = logging.getLogger(__name__)
router = APIRouter(tags=["compliance-dashboard"])
@@ -86,6 +102,14 @@ async def get_dashboard(db: Session = Depends(get_db)):
# or compute from by_status dict
score = ctrl_stats.get("compliance_score", 0.0)
# Multi-dimensional score (Anti-Fake-Evidence)
try:
ms = ctrl_repo.get_multi_dimensional_score()
multi_score = MultiDimensionalScore(**ms)
except Exception as e:
logger.warning(f"Failed to compute multi-dimensional score: {e}")
multi_score = None
return DashboardResponse(
compliance_score=round(score, 1),
total_regulations=len(regulations),
@@ -98,6 +122,7 @@ async def get_dashboard(db: Session = Depends(get_db)):
total_risks=len(risks),
risks_by_level=risks_by_level,
recent_activity=[],
multi_score=multi_score,
)
@@ -116,11 +141,18 @@ async def get_compliance_score(db: Session = Depends(get_db)):
else:
score = 0
# Multi-dimensional score (Anti-Fake-Evidence)
try:
multi_score = ctrl_repo.get_multi_dimensional_score()
except Exception:
multi_score = None
return {
"score": round(score, 1),
"total_controls": total,
"passing_controls": passing,
"partial_controls": partial,
"multi_score": multi_score,
}
@@ -322,6 +354,424 @@ async def get_compliance_trend(
}
# ============================================================================
# Dashboard Extended — Roadmap, Module-Status, Next-Actions, Snapshots
# ============================================================================
# Weight map for control prioritisation
_PRIORITY_WEIGHTS = {"legal": 5, "security": 3, "best_practice": 1, "operational": 2}
# SDK module definitions → DB table used for counting completion
_MODULE_DEFS: List[Dict[str, str]] = [
{"key": "vvt", "label": "VVT", "table": "compliance_vvt_activities"},
{"key": "tom", "label": "TOM", "table": "compliance_toms"},
{"key": "dsfa", "label": "DSFA", "table": "compliance_dsfa_assessments"},
{"key": "loeschfristen", "label": "Loeschfristen", "table": "compliance_loeschfristen"},
{"key": "risks", "label": "Risiken", "table": "compliance_risks"},
{"key": "controls", "label": "Controls", "table": "compliance_controls"},
{"key": "evidence", "label": "Nachweise", "table": "compliance_evidence"},
{"key": "obligations", "label": "Pflichten", "table": "compliance_obligations"},
{"key": "incidents", "label": "Vorfaelle", "table": "compliance_notfallplan_incidents"},
{"key": "vendor", "label": "Auftragsverarbeiter", "table": "compliance_vendor_assessments"},
{"key": "legal_templates", "label": "Rechtl. Dokumente", "table": "compliance_legal_templates"},
{"key": "training", "label": "Schulungen", "table": "training_modules"},
{"key": "audit", "label": "Audit", "table": "compliance_audit_sessions"},
{"key": "security_backlog", "label": "Security-Backlog", "table": "compliance_security_backlog"},
{"key": "quality", "label": "Qualitaet", "table": "compliance_quality_items"},
]
@router.get("/dashboard/roadmap")
async def get_dashboard_roadmap(
db: Session = Depends(get_db),
tenant_id: str = Depends(_get_tenant_id),
):
"""Prioritised controls in 4 buckets: Quick Wins, Must Have, Should Have, Nice to Have."""
ctrl_repo = ControlRepository(db)
controls = ctrl_repo.get_all()
today = datetime.utcnow().date()
buckets: Dict[str, list] = {
"quick_wins": [],
"must_have": [],
"should_have": [],
"nice_to_have": [],
}
for ctrl in controls:
status = ctrl.status.value if ctrl.status else "planned"
if status == "pass":
continue # already done
weight = _PRIORITY_WEIGHTS.get(ctrl.category if hasattr(ctrl, "category") else "best_practice", 1)
days_overdue = 0
if ctrl.next_review_at:
review_date = ctrl.next_review_at.date() if hasattr(ctrl.next_review_at, "date") else ctrl.next_review_at
days_overdue = (today - review_date).days
urgency = weight * 2 + (1 if days_overdue > 0 else 0)
item = {
"id": str(ctrl.id),
"control_id": ctrl.control_id,
"title": ctrl.title,
"status": status,
"domain": ctrl.domain.value if ctrl.domain else "unknown",
"owner": ctrl.owner,
"next_review_at": ctrl.next_review_at.isoformat() if ctrl.next_review_at else None,
"days_overdue": max(0, days_overdue),
"weight": weight,
}
if weight >= 5 and days_overdue > 0:
buckets["quick_wins"].append(item)
elif weight >= 4:
buckets["must_have"].append(item)
elif weight >= 2:
buckets["should_have"].append(item)
else:
buckets["nice_to_have"].append(item)
# Sort each bucket by urgency desc
for key in buckets:
buckets[key].sort(key=lambda x: x["days_overdue"], reverse=True)
return {
"buckets": buckets,
"counts": {k: len(v) for k, v in buckets.items()},
"generated_at": datetime.utcnow().isoformat(),
}
@router.get("/dashboard/module-status")
async def get_module_status(
db: Session = Depends(get_db),
tenant_id: str = Depends(_get_tenant_id),
):
"""Completion status for each SDK module based on DB record counts."""
modules = []
for mod in _MODULE_DEFS:
try:
row = db.execute(
text(f"SELECT COUNT(*) FROM {mod['table']} WHERE tenant_id = :tid"),
{"tid": tenant_id},
).fetchone()
count = int(row[0]) if row else 0
except Exception:
count = 0
# Simple heuristic: 0 = not started, 1-2 = in progress, 3+ = complete
if count == 0:
status = "not_started"
progress = 0
elif count < 3:
status = "in_progress"
progress = min(60, count * 30)
else:
status = "complete"
progress = 100
modules.append({
"key": mod["key"],
"label": mod["label"],
"count": count,
"status": status,
"progress": progress,
})
started = sum(1 for m in modules if m["status"] != "not_started")
complete = sum(1 for m in modules if m["status"] == "complete")
return {
"modules": modules,
"total": len(modules),
"started": started,
"complete": complete,
"overall_progress": round((complete / len(modules)) * 100, 1) if modules else 0,
}
@router.get("/dashboard/next-actions")
async def get_next_actions(
limit: int = Query(5, ge=1, le=20),
db: Session = Depends(get_db),
tenant_id: str = Depends(_get_tenant_id),
):
"""Top N most important actions sorted by urgency*impact."""
ctrl_repo = ControlRepository(db)
controls = ctrl_repo.get_all()
today = datetime.utcnow().date()
actions = []
for ctrl in controls:
status = ctrl.status.value if ctrl.status else "planned"
if status == "pass":
continue
days_overdue = 0
if ctrl.next_review_at:
review_date = ctrl.next_review_at.date() if hasattr(ctrl.next_review_at, "date") else ctrl.next_review_at
days_overdue = max(0, (today - review_date).days)
weight = _PRIORITY_WEIGHTS.get(ctrl.category if hasattr(ctrl, "category") else "best_practice", 1)
urgency_score = weight * 10 + days_overdue
actions.append({
"id": str(ctrl.id),
"control_id": ctrl.control_id,
"title": ctrl.title,
"status": status,
"domain": ctrl.domain.value if ctrl.domain else "unknown",
"owner": ctrl.owner,
"days_overdue": days_overdue,
"urgency_score": urgency_score,
"reason": "Ueberfaellig" if days_overdue > 0 else "Offen",
})
actions.sort(key=lambda x: x["urgency_score"], reverse=True)
return {"actions": actions[:limit]}
@router.post("/dashboard/snapshot")
async def create_score_snapshot(
db: Session = Depends(get_db),
tenant_id: str = Depends(_get_tenant_id),
):
"""Save current compliance score as a historical snapshot."""
ctrl_repo = ControlRepository(db)
evidence_repo = EvidenceRepository(db)
risk_repo = RiskRepository(db)
ctrl_stats = ctrl_repo.get_statistics()
evidence_stats = evidence_repo.get_statistics()
risks = risk_repo.get_all()
total = ctrl_stats.get("total", 0)
passing = ctrl_stats.get("pass", 0)
partial = ctrl_stats.get("partial", 0)
score = round(((passing + partial * 0.5) / total) * 100, 2) if total > 0 else 0
risks_high = sum(1 for r in risks if (r.inherent_risk.value if r.inherent_risk else "low") in ("high", "critical"))
today = date.today()
row = db.execute(text("""
INSERT INTO compliance_score_snapshots (
tenant_id, score, controls_total, controls_pass, controls_partial,
evidence_total, evidence_valid, risks_total, risks_high, snapshot_date
) VALUES (
:tenant_id, :score, :controls_total, :controls_pass, :controls_partial,
:evidence_total, :evidence_valid, :risks_total, :risks_high, :snapshot_date
)
ON CONFLICT (tenant_id, project_id, snapshot_date) DO UPDATE SET
score = EXCLUDED.score,
controls_total = EXCLUDED.controls_total,
controls_pass = EXCLUDED.controls_pass,
controls_partial = EXCLUDED.controls_partial,
evidence_total = EXCLUDED.evidence_total,
evidence_valid = EXCLUDED.evidence_valid,
risks_total = EXCLUDED.risks_total,
risks_high = EXCLUDED.risks_high
RETURNING *
"""), {
"tenant_id": tenant_id,
"score": score,
"controls_total": total,
"controls_pass": passing,
"controls_partial": partial,
"evidence_total": evidence_stats.get("total", 0),
"evidence_valid": evidence_stats.get("by_status", {}).get("valid", 0),
"risks_total": len(risks),
"risks_high": risks_high,
"snapshot_date": today,
}).fetchone()
db.commit()
return _row_to_dict(row)
@router.get("/dashboard/score-history")
async def get_score_history(
months: int = Query(12, ge=1, le=36),
db: Session = Depends(get_db),
tenant_id: str = Depends(_get_tenant_id),
):
"""Get compliance score history from snapshots."""
since = date.today() - timedelta(days=months * 30)
rows = db.execute(text("""
SELECT * FROM compliance_score_snapshots
WHERE tenant_id = :tenant_id AND snapshot_date >= :since
ORDER BY snapshot_date ASC
"""), {"tenant_id": tenant_id, "since": since}).fetchall()
snapshots = []
for r in rows:
d = _row_to_dict(r)
# Convert Decimal to float for JSON
if isinstance(d.get("score"), Decimal):
d["score"] = float(d["score"])
snapshots.append(d)
return {
"snapshots": snapshots,
"total": len(snapshots),
"period_months": months,
}
# ============================================================================
# Evidence Distribution (Anti-Fake-Evidence Phase 3)
# ============================================================================
@router.get("/dashboard/evidence-distribution")
async def get_evidence_distribution(
db: Session = Depends(get_db),
tenant_id: str = Depends(_get_tenant_id),
):
"""Evidence counts by confidence level and four-eyes status."""
evidence_repo = EvidenceRepository(db)
all_evidence = evidence_repo.get_all()
by_confidence = {"E0": 0, "E1": 0, "E2": 0, "E3": 0, "E4": 0}
four_eyes_pending = 0
for e in all_evidence:
level = e.confidence_level.value if e.confidence_level else "E1"
if level in by_confidence:
by_confidence[level] += 1
if e.requires_four_eyes and e.approval_status not in ("approved", "rejected"):
four_eyes_pending += 1
return {
"by_confidence": by_confidence,
"four_eyes_pending": four_eyes_pending,
"total": len(all_evidence),
}
# ============================================================================
# Traceability Matrix (Anti-Fake-Evidence Phase 4a)
# ============================================================================
@router.get("/dashboard/traceability-matrix", response_model=TraceabilityMatrixResponse)
async def get_traceability_matrix(
db: Session = Depends(get_db),
tenant_id: str = Depends(_get_tenant_id),
):
"""
Full traceability chain: Control → Evidence → Assertions.
Loads each entity set once, builds in-memory indices, and nests
the result so the frontend can render a matrix view.
"""
ctrl_repo = ControlRepository(db)
evidence_repo = EvidenceRepository(db)
# 1. Load all three entity sets
controls = ctrl_repo.get_all()
all_evidence = evidence_repo.get_all()
all_assertions = db.query(AssertionDB).filter(
AssertionDB.entity_type == "evidence",
).all()
# 2. Index assertions by evidence_id (entity_id)
assertions_by_evidence: Dict[str, list] = {}
for a in all_assertions:
assertions_by_evidence.setdefault(a.entity_id, []).append(a)
# 3. Index evidence by control_id
evidence_by_control: Dict[str, list] = {}
for e in all_evidence:
evidence_by_control.setdefault(str(e.control_id), []).append(e)
# 4. Build nested response
result_controls: list = []
total_controls = 0
covered_controls = 0
fully_verified = 0
for ctrl in controls:
total_controls += 1
ctrl_id = str(ctrl.id)
ctrl_evidence = evidence_by_control.get(ctrl_id, [])
nested_evidence: list = []
has_evidence = len(ctrl_evidence) > 0
has_assertions = False
all_verified = True
min_conf: Optional[str] = None
conf_order = {"E0": 0, "E1": 1, "E2": 2, "E3": 3, "E4": 4}
for e in ctrl_evidence:
ev_id = str(e.id)
ev_assertions = assertions_by_evidence.get(ev_id, [])
nested_assertions = [
TraceabilityAssertion(
id=str(a.id),
sentence_text=a.sentence_text,
assertion_type=a.assertion_type or "assertion",
confidence=a.confidence or 0.0,
verified=a.verified_by is not None,
)
for a in ev_assertions
]
if nested_assertions:
has_assertions = True
for na in nested_assertions:
if not na.verified:
all_verified = False
conf = e.confidence_level.value if e.confidence_level else "E1"
if min_conf is None or conf_order.get(conf, 1) < conf_order.get(min_conf, 1):
min_conf = conf
nested_evidence.append(TraceabilityEvidence(
id=ev_id,
title=e.title,
evidence_type=e.evidence_type,
confidence_level=conf,
status=e.status.value if e.status else "valid",
assertions=nested_assertions,
))
if not has_assertions:
all_verified = False
if has_evidence:
covered_controls += 1
if has_evidence and has_assertions and all_verified:
fully_verified += 1
coverage = TraceabilityCoverage(
has_evidence=has_evidence,
has_assertions=has_assertions,
all_assertions_verified=all_verified,
min_confidence_level=min_conf,
)
result_controls.append(TraceabilityControl(
id=ctrl_id,
control_id=ctrl.control_id,
title=ctrl.title,
status=ctrl.status.value if ctrl.status else "planned",
domain=ctrl.domain.value if ctrl.domain else "unknown",
evidence=nested_evidence,
coverage=coverage,
))
summary = {
"total_controls": total_controls,
"covered_controls": covered_controls,
"fully_verified": fully_verified,
"uncovered_controls": total_controls - covered_controls,
}
return TraceabilityMatrixResponse(controls=result_controls, summary=summary)
# ============================================================================
# Reports
# ============================================================================

View File

@@ -60,10 +60,314 @@ def get_dsfa_service(db: Session = Depends(get_db)) -> DSFAService:
return DSFAService(db)
def get_workflow_service(
db: Session = Depends(get_db),
) -> DSFAWorkflowService:
return DSFAWorkflowService(db)
# =============================================================================
# Pydantic Schemas
# =============================================================================
class DSFACreate(BaseModel):
title: str
description: str = ""
status: str = "draft"
risk_level: str = "low"
processing_activity: str = ""
data_categories: List[str] = []
recipients: List[str] = []
measures: List[str] = []
created_by: str = "system"
# Section 1
processing_description: Optional[str] = None
processing_purpose: Optional[str] = None
legal_basis: Optional[str] = None
legal_basis_details: Optional[str] = None
# Section 2
necessity_assessment: Optional[str] = None
proportionality_assessment: Optional[str] = None
data_minimization: Optional[str] = None
alternatives_considered: Optional[str] = None
retention_justification: Optional[str] = None
# Section 3
involves_ai: Optional[bool] = None
overall_risk_level: Optional[str] = None
risk_score: Optional[int] = None
# Section 6
dpo_consulted: Optional[bool] = None
dpo_name: Optional[str] = None
dpo_opinion: Optional[str] = None
dpo_approved: Optional[bool] = None
authority_consulted: Optional[bool] = None
authority_reference: Optional[str] = None
authority_decision: Optional[str] = None
# Metadata
version: Optional[int] = None
conclusion: Optional[str] = None
federal_state: Optional[str] = None
authority_resource_id: Optional[str] = None
submitted_by: Optional[str] = None
# JSONB Arrays
data_subjects: Optional[List[str]] = None
affected_rights: Optional[List[str]] = None
triggered_rule_codes: Optional[List[str]] = None
ai_trigger_ids: Optional[List[str]] = None
wp248_criteria_met: Optional[List[str]] = None
art35_abs3_triggered: Optional[List[str]] = None
tom_references: Optional[List[str]] = None
risks: Optional[List[dict]] = None
mitigations: Optional[List[dict]] = None
stakeholder_consultations: Optional[List[dict]] = None
review_triggers: Optional[List[dict]] = None
review_comments: Optional[List[dict]] = None
ai_use_case_modules: Optional[List[dict]] = None
section_8_complete: Optional[bool] = None
# JSONB Objects
threshold_analysis: Optional[dict] = None
consultation_requirement: Optional[dict] = None
review_schedule: Optional[dict] = None
section_progress: Optional[dict] = None
metadata: Optional[dict] = None
class DSFAUpdate(BaseModel):
title: Optional[str] = None
description: Optional[str] = None
status: Optional[str] = None
risk_level: Optional[str] = None
processing_activity: Optional[str] = None
data_categories: Optional[List[str]] = None
recipients: Optional[List[str]] = None
measures: Optional[List[str]] = None
approved_by: Optional[str] = None
# Section 1
processing_description: Optional[str] = None
processing_purpose: Optional[str] = None
legal_basis: Optional[str] = None
legal_basis_details: Optional[str] = None
# Section 2
necessity_assessment: Optional[str] = None
proportionality_assessment: Optional[str] = None
data_minimization: Optional[str] = None
alternatives_considered: Optional[str] = None
retention_justification: Optional[str] = None
# Section 3
involves_ai: Optional[bool] = None
overall_risk_level: Optional[str] = None
risk_score: Optional[int] = None
# Section 6
dpo_consulted: Optional[bool] = None
dpo_name: Optional[str] = None
dpo_opinion: Optional[str] = None
dpo_approved: Optional[bool] = None
authority_consulted: Optional[bool] = None
authority_reference: Optional[str] = None
authority_decision: Optional[str] = None
# Metadata
version: Optional[int] = None
conclusion: Optional[str] = None
federal_state: Optional[str] = None
authority_resource_id: Optional[str] = None
submitted_by: Optional[str] = None
# JSONB Arrays
data_subjects: Optional[List[str]] = None
affected_rights: Optional[List[str]] = None
triggered_rule_codes: Optional[List[str]] = None
ai_trigger_ids: Optional[List[str]] = None
wp248_criteria_met: Optional[List[str]] = None
art35_abs3_triggered: Optional[List[str]] = None
tom_references: Optional[List[str]] = None
risks: Optional[List[dict]] = None
mitigations: Optional[List[dict]] = None
stakeholder_consultations: Optional[List[dict]] = None
review_triggers: Optional[List[dict]] = None
review_comments: Optional[List[dict]] = None
ai_use_case_modules: Optional[List[dict]] = None
section_8_complete: Optional[bool] = None
# JSONB Objects
threshold_analysis: Optional[dict] = None
consultation_requirement: Optional[dict] = None
review_schedule: Optional[dict] = None
section_progress: Optional[dict] = None
metadata: Optional[dict] = None
class DSFAStatusUpdate(BaseModel):
status: str
approved_by: Optional[str] = None
class DSFASectionUpdate(BaseModel):
"""Body for PUT /dsfa/{id}/sections/{section_number}."""
content: Optional[str] = None
# Allow arbitrary extra fields so the frontend can send any section-specific data
extra: Optional[dict] = None
class DSFAApproveRequest(BaseModel):
"""Body for POST /dsfa/{id}/approve."""
approved: bool
comments: Optional[str] = None
approved_by: Optional[str] = None
# =============================================================================
# Helpers
# =============================================================================
def _get_tenant_id(tenant_id: Optional[str]) -> str:
return tenant_id or DEFAULT_TENANT_ID
def _dsfa_to_response(row) -> dict:
"""Convert a DB row to a JSON-serializable dict."""
import json
# SQLAlchemy 2.0: Row objects need ._mapping for string-key access
if hasattr(row, "_mapping"):
row = row._mapping
def _parse_arr(val):
"""Parse a JSONB array field → list."""
if val is None:
return []
if isinstance(val, list):
return val
if isinstance(val, str):
try:
parsed = json.loads(val)
return parsed if isinstance(parsed, list) else []
except Exception:
return []
return val
def _parse_obj(val):
"""Parse a JSONB object field → dict."""
if val is None:
return {}
if isinstance(val, dict):
return val
if isinstance(val, str):
try:
parsed = json.loads(val)
return parsed if isinstance(parsed, dict) else {}
except Exception:
return {}
return val
def _ts(val):
"""Timestamp → ISO string or None."""
if not val:
return None
if isinstance(val, str):
return val
return val.isoformat()
def _get(key, default=None):
"""Safe row access — returns default if key missing (handles old rows)."""
try:
v = row[key]
return default if v is None and default is not None else v
except (KeyError, IndexError):
return default
return {
# Core fields (always present since Migration 024)
"id": str(row["id"]),
"tenant_id": row["tenant_id"],
"title": row["title"],
"description": row["description"] or "",
"status": row["status"] or "draft",
"risk_level": row["risk_level"] or "low",
"processing_activity": row["processing_activity"] or "",
"data_categories": _parse_arr(row["data_categories"]),
"recipients": _parse_arr(row["recipients"]),
"measures": _parse_arr(row["measures"]),
"approved_by": row["approved_by"],
"approved_at": _ts(row["approved_at"]),
"created_by": row["created_by"] or "system",
"created_at": _ts(row["created_at"]),
"updated_at": _ts(row["updated_at"]),
# Section 1 (Migration 030)
"processing_description": _get("processing_description"),
"processing_purpose": _get("processing_purpose"),
"legal_basis": _get("legal_basis"),
"legal_basis_details": _get("legal_basis_details"),
# Section 2
"necessity_assessment": _get("necessity_assessment"),
"proportionality_assessment": _get("proportionality_assessment"),
"data_minimization": _get("data_minimization"),
"alternatives_considered": _get("alternatives_considered"),
"retention_justification": _get("retention_justification"),
# Section 3
"involves_ai": _get("involves_ai", False),
"overall_risk_level": _get("overall_risk_level"),
"risk_score": _get("risk_score", 0),
# Section 6
"dpo_consulted": _get("dpo_consulted", False),
"dpo_consulted_at": _ts(_get("dpo_consulted_at")),
"dpo_name": _get("dpo_name"),
"dpo_opinion": _get("dpo_opinion"),
"dpo_approved": _get("dpo_approved"),
"authority_consulted": _get("authority_consulted", False),
"authority_consulted_at": _ts(_get("authority_consulted_at")),
"authority_reference": _get("authority_reference"),
"authority_decision": _get("authority_decision"),
# Metadata / Versioning
"version": _get("version", 1),
"previous_version_id": str(_get("previous_version_id")) if _get("previous_version_id") else None,
"conclusion": _get("conclusion"),
"federal_state": _get("federal_state"),
"authority_resource_id": _get("authority_resource_id"),
"submitted_for_review_at": _ts(_get("submitted_for_review_at")),
"submitted_by": _get("submitted_by"),
# JSONB Arrays
"data_subjects": _parse_arr(_get("data_subjects")),
"affected_rights": _parse_arr(_get("affected_rights")),
"triggered_rule_codes": _parse_arr(_get("triggered_rule_codes")),
"ai_trigger_ids": _parse_arr(_get("ai_trigger_ids")),
"wp248_criteria_met": _parse_arr(_get("wp248_criteria_met")),
"art35_abs3_triggered": _parse_arr(_get("art35_abs3_triggered")),
"tom_references": _parse_arr(_get("tom_references")),
"risks": _parse_arr(_get("risks")),
"mitigations": _parse_arr(_get("mitigations")),
"stakeholder_consultations": _parse_arr(_get("stakeholder_consultations")),
"review_triggers": _parse_arr(_get("review_triggers")),
"review_comments": _parse_arr(_get("review_comments")),
# Section 8 / AI (Migration 028)
"ai_use_case_modules": _parse_arr(_get("ai_use_case_modules")),
"section_8_complete": _get("section_8_complete", False),
# JSONB Objects
"threshold_analysis": _parse_obj(_get("threshold_analysis")),
"consultation_requirement": _parse_obj(_get("consultation_requirement")),
"review_schedule": _parse_obj(_get("review_schedule")),
"section_progress": _parse_obj(_get("section_progress")),
"metadata": _parse_obj(_get("metadata")),
}
def _log_audit(
db: Session,
tenant_id: str,
dsfa_id,
action: str,
changed_by: str = "system",
old_values=None,
new_values=None,
):
import json
db.execute(
text("""
INSERT INTO compliance_dsfa_audit_log
(tenant_id, dsfa_id, action, changed_by, old_values, new_values)
VALUES
(:tenant_id, :dsfa_id, :action, :changed_by,
CAST(:old_values AS jsonb), CAST(:new_values AS jsonb))
"""),
{
"tenant_id": tenant_id,
"dsfa_id": str(dsfa_id) if dsfa_id else None,
"action": action,
"changed_by": changed_by,
"old_values": json.dumps(old_values) if old_values else None,
"new_values": json.dumps(new_values) if new_values else None,
},
)
# =============================================================================
@@ -177,8 +481,51 @@ async def create_dsfa(
service: DSFAService = Depends(get_dsfa_service),
) -> dict[str, Any]:
"""Neue DSFA erstellen."""
with translate_domain_errors():
return service.create(tenant_id, request)
import json
if request.status not in VALID_STATUSES:
raise HTTPException(status_code=422, detail=f"Ungültiger Status: {request.status}")
if request.risk_level not in VALID_RISK_LEVELS:
raise HTTPException(status_code=422, detail=f"Ungültiges Risiko-Level: {request.risk_level}")
tid = _get_tenant_id(tenant_id)
row = db.execute(
text("""
INSERT INTO compliance_dsfas
(tenant_id, title, description, status, risk_level,
processing_activity, data_categories, recipients, measures, created_by)
VALUES
(:tenant_id, :title, :description, :status, :risk_level,
:processing_activity,
CAST(:data_categories AS jsonb),
CAST(:recipients AS jsonb),
CAST(:measures AS jsonb),
:created_by)
RETURNING *
"""),
{
"tenant_id": tid,
"title": request.title,
"description": request.description,
"status": request.status,
"risk_level": request.risk_level,
"processing_activity": request.processing_activity,
"data_categories": json.dumps(request.data_categories),
"recipients": json.dumps(request.recipients),
"measures": json.dumps(request.measures),
"created_by": request.created_by,
},
).fetchone()
db.flush()
row_id = row._mapping["id"] if hasattr(row, "_mapping") else row[0]
_log_audit(
db, tid, row_id, "CREATE", request.created_by,
new_values={"title": request.title, "status": request.status},
)
db.commit()
return _dsfa_to_response(row)
# =============================================================================

File diff suppressed because it is too large Load Diff

View File

@@ -22,23 +22,21 @@ from fastapi import APIRouter, Depends, File, HTTPException, Query, UploadFile
from sqlalchemy.orm import Session
from classroom_engine.database import get_db
from compliance.api._http_errors import translate_domain_errors
from compliance.db import ControlRepository, EvidenceRepository
from compliance.schemas.evidence import (
EvidenceCreate,
EvidenceListResponse,
EvidenceResponse,
from ..db import (
ControlRepository,
EvidenceRepository,
EvidenceStatusEnum,
EvidenceConfidenceEnum,
EvidenceTruthStatusEnum,
)
from compliance.services.auto_risk_updater import AutoRiskUpdater
from compliance.domain import NotFoundError, ValidationError
from compliance.services.evidence_service import (
SOURCE_CONTROL_MAP,
EvidenceService,
_extract_findings_detail, # re-exported for legacy test imports
_parse_ci_evidence, # re-exported for legacy test imports
_store_evidence, # re-exported for legacy test imports
_update_risks as _update_risks_impl,
from ..db.models import EvidenceDB, ControlDB, AuditTrailDB
from ..services.auto_risk_updater import AutoRiskUpdater
from .schemas import (
EvidenceCreate, EvidenceResponse, EvidenceListResponse,
EvidenceRejectRequest,
)
from .audit_trail_utils import log_audit_trail
logger = logging.getLogger(__name__)
router = APIRouter(tags=["compliance-evidence"])
@@ -56,7 +54,88 @@ def get_evidence_service(db: Session = Depends(get_db)) -> EvidenceService:
# ============================================================================
# Evidence CRUD
# Anti-Fake-Evidence: Four-Eyes Domain Check
# ============================================================================
FOUR_EYES_DOMAINS = {"gov", "priv"}
def _requires_four_eyes(control_domain: str) -> bool:
"""Controls in governance/privacy domains require two independent reviewers."""
return control_domain in FOUR_EYES_DOMAINS
# ============================================================================
# Anti-Fake-Evidence: Auto-Classification Helpers
# ============================================================================
def _classify_confidence(source: Optional[str], evidence_type: Optional[str] = None, artifact_hash: Optional[str] = None) -> EvidenceConfidenceEnum:
"""Classify evidence confidence level based on source and metadata."""
if source == "ci_pipeline":
return EvidenceConfidenceEnum.E3
if source == "api" and artifact_hash:
return EvidenceConfidenceEnum.E3
if source == "api":
return EvidenceConfidenceEnum.E3
if source in ("manual", "upload"):
return EvidenceConfidenceEnum.E1
if source == "generated":
return EvidenceConfidenceEnum.E0
# Default for unknown sources
return EvidenceConfidenceEnum.E1
def _classify_truth_status(source: Optional[str]) -> EvidenceTruthStatusEnum:
"""Classify evidence truth status based on source."""
if source == "ci_pipeline":
return EvidenceTruthStatusEnum.OBSERVED
if source in ("manual", "upload"):
return EvidenceTruthStatusEnum.UPLOADED
if source == "generated":
return EvidenceTruthStatusEnum.GENERATED
if source == "api":
return EvidenceTruthStatusEnum.OBSERVED
return EvidenceTruthStatusEnum.UPLOADED
def _build_evidence_response(e: EvidenceDB) -> EvidenceResponse:
"""Build an EvidenceResponse from an EvidenceDB, including anti-fake fields."""
return EvidenceResponse(
id=e.id,
control_id=e.control_id,
evidence_type=e.evidence_type,
title=e.title,
description=e.description,
artifact_path=e.artifact_path,
artifact_url=e.artifact_url,
artifact_hash=e.artifact_hash,
file_size_bytes=e.file_size_bytes,
mime_type=e.mime_type,
valid_from=e.valid_from,
valid_until=e.valid_until,
status=e.status.value if e.status else None,
source=e.source,
ci_job_id=e.ci_job_id,
uploaded_by=e.uploaded_by,
collected_at=e.collected_at,
created_at=e.created_at,
confidence_level=e.confidence_level.value if e.confidence_level else None,
truth_status=e.truth_status.value if e.truth_status else None,
generation_mode=e.generation_mode,
may_be_used_as_evidence=e.may_be_used_as_evidence,
reviewed_by=e.reviewed_by,
reviewed_at=e.reviewed_at,
approval_status=e.approval_status,
first_reviewer=e.first_reviewer,
first_reviewed_at=e.first_reviewed_at,
second_reviewer=e.second_reviewer,
second_reviewed_at=e.second_reviewed_at,
requires_four_eyes=e.requires_four_eyes,
)
# ============================================================================
# Evidence
# ============================================================================
@router.get("/evidence", response_model=EvidenceListResponse)
@@ -69,8 +148,38 @@ async def list_evidence(
service: EvidenceService = Depends(get_evidence_service),
) -> EvidenceListResponse:
"""List evidence with optional filters and pagination."""
with translate_domain_errors():
return service.list_evidence(control_id, evidence_type, status, page, limit)
repo = EvidenceRepository(db)
if control_id:
# First get the control UUID
ctrl_repo = ControlRepository(db)
control = ctrl_repo.get_by_control_id(control_id)
if not control:
raise HTTPException(status_code=404, detail=f"Control {control_id} not found")
evidence = repo.get_by_control(control.id)
else:
evidence = repo.get_all()
if evidence_type:
evidence = [e for e in evidence if e.evidence_type == evidence_type]
if status:
try:
status_enum = EvidenceStatusEnum(status)
evidence = [e for e in evidence if e.status == status_enum]
except ValueError:
pass
total = len(evidence)
# Apply pagination if requested
if page is not None and limit is not None:
offset = (page - 1) * limit
evidence = evidence[offset:offset + limit]
results = [_build_evidence_response(e) for e in evidence]
return EvidenceListResponse(evidence=results, total=total)
@router.post("/evidence", response_model=EvidenceResponse)
@@ -79,8 +188,66 @@ async def create_evidence(
service: EvidenceService = Depends(get_evidence_service),
) -> EvidenceResponse:
"""Create new evidence record."""
with translate_domain_errors():
return service.create_evidence(evidence_data)
repo = EvidenceRepository(db)
# Get control UUID
ctrl_repo = ControlRepository(db)
control = ctrl_repo.get_by_control_id(evidence_data.control_id)
if not control:
raise HTTPException(status_code=404, detail=f"Control {evidence_data.control_id} not found")
source = evidence_data.source or "api"
confidence = _classify_confidence(source, evidence_data.evidence_type)
truth = _classify_truth_status(source)
# Allow explicit override from request
if evidence_data.confidence_level:
try:
confidence = EvidenceConfidenceEnum(evidence_data.confidence_level)
except ValueError:
pass
if evidence_data.truth_status:
try:
truth = EvidenceTruthStatusEnum(evidence_data.truth_status)
except ValueError:
pass
evidence = repo.create(
control_id=control.id,
evidence_type=evidence_data.evidence_type,
title=evidence_data.title,
description=evidence_data.description,
artifact_url=evidence_data.artifact_url,
valid_from=evidence_data.valid_from,
valid_until=evidence_data.valid_until,
source=source,
ci_job_id=evidence_data.ci_job_id,
)
# Set anti-fake-evidence fields
evidence.confidence_level = confidence
evidence.truth_status = truth
# Generated evidence should not be used as evidence by default
if truth == EvidenceTruthStatusEnum.GENERATED:
evidence.may_be_used_as_evidence = False
# Four-Eyes: check if the linked control's domain requires it
control_domain = control.domain.value if control.domain else ""
if _requires_four_eyes(control_domain):
evidence.requires_four_eyes = True
evidence.approval_status = "pending_first"
db.commit()
# Audit trail
log_audit_trail(
db, "evidence", evidence.id, evidence.title, "create",
performed_by=evidence_data.source or "api",
change_summary=f"Evidence created with confidence={confidence.value}, truth={truth.value}",
)
db.commit()
return _build_evidence_response(evidence)
@router.delete("/evidence/{evidence_id}")
@@ -107,9 +274,271 @@ async def upload_evidence(
service: EvidenceService = Depends(get_evidence_service),
) -> EvidenceResponse:
"""Upload evidence file."""
with translate_domain_errors():
return await service.upload_evidence(
control_id, evidence_type, title, file, description
# Get control UUID
ctrl_repo = ControlRepository(db)
control = ctrl_repo.get_by_control_id(control_id)
if not control:
raise HTTPException(status_code=404, detail=f"Control {control_id} not found")
# Create upload directory
upload_dir = f"/tmp/compliance_evidence/{control_id}"
os.makedirs(upload_dir, exist_ok=True)
# Save file
file_path = os.path.join(upload_dir, file.filename)
content = await file.read()
with open(file_path, "wb") as f:
f.write(content)
# Calculate hash
file_hash = hashlib.sha256(content).hexdigest()
# Create evidence record
repo = EvidenceRepository(db)
evidence = repo.create(
control_id=control.id,
evidence_type=evidence_type,
title=title,
description=description,
artifact_path=file_path,
artifact_hash=file_hash,
file_size_bytes=len(content),
mime_type=file.content_type,
source="upload",
)
# Upload evidence → E1 + uploaded
evidence.confidence_level = EvidenceConfidenceEnum.E1
evidence.truth_status = EvidenceTruthStatusEnum.UPLOADED
# Four-Eyes: check if the linked control's domain requires it
control_domain = control.domain.value if control.domain else ""
if _requires_four_eyes(control_domain):
evidence.requires_four_eyes = True
evidence.approval_status = "pending_first"
db.commit()
return _build_evidence_response(evidence)
# ============================================================================
# CI/CD Evidence Collection — helpers
# ============================================================================
# Map CI source names to the corresponding control IDs
SOURCE_CONTROL_MAP = {
"sast": "SDLC-001",
"dependency_scan": "SDLC-002",
"secret_scan": "SDLC-003",
"code_review": "SDLC-004",
"sbom": "SDLC-005",
"container_scan": "SDLC-006",
"test_results": "AUD-001",
}
def _parse_ci_evidence(data: dict) -> dict:
"""
Parse and validate incoming CI evidence data.
Returns a dict with:
- report_json: str (serialised JSON)
- report_hash: str (SHA-256 hex digest)
- evidence_status: str ("valid" or "failed")
- findings_count: int
- critical_findings: int
"""
report_json = json.dumps(data) if data else "{}"
report_hash = hashlib.sha256(report_json.encode()).hexdigest()
findings_count = 0
critical_findings = 0
if data and isinstance(data, dict):
# Semgrep format
if "results" in data:
findings_count = len(data.get("results", []))
critical_findings = len([
r for r in data.get("results", [])
if r.get("extra", {}).get("severity", "").upper() in ["CRITICAL", "HIGH"]
])
# Trivy format
elif "Results" in data:
for result in data.get("Results", []):
vulns = result.get("Vulnerabilities", [])
findings_count += len(vulns)
critical_findings += len([
v for v in vulns
if v.get("Severity", "").upper() in ["CRITICAL", "HIGH"]
])
# Generic findings array
elif "findings" in data:
findings_count = len(data.get("findings", []))
# SBOM format - just count components
elif "components" in data:
findings_count = len(data.get("components", []))
evidence_status = "failed" if critical_findings > 0 else "valid"
return {
"report_json": report_json,
"report_hash": report_hash,
"evidence_status": evidence_status,
"findings_count": findings_count,
"critical_findings": critical_findings,
}
def _store_evidence(
db: Session,
*,
control_db_id: str,
source: str,
parsed: dict,
ci_job_id: str,
ci_job_url: str,
report_data: dict,
) -> EvidenceDB:
"""
Persist a CI evidence item to the database and write the report file.
Returns the created EvidenceDB instance (already committed).
"""
findings_count = parsed["findings_count"]
critical_findings = parsed["critical_findings"]
# Build title and description
title = f"{source.upper()} Report - {datetime.now().strftime('%Y-%m-%d %H:%M')}"
description = "Automatically collected from CI/CD pipeline"
if findings_count > 0:
description += f"\n- Total findings: {findings_count}"
if critical_findings > 0:
description += f"\n- Critical/High findings: {critical_findings}"
if ci_job_id:
description += f"\n- CI Job ID: {ci_job_id}"
if ci_job_url:
description += f"\n- CI Job URL: {ci_job_url}"
# Store report file
upload_dir = f"/tmp/compliance_evidence/ci/{source}"
os.makedirs(upload_dir, exist_ok=True)
file_name = f"{source}_{datetime.now().strftime('%Y%m%d_%H%M%S')}_{parsed['report_hash'][:8]}.json"
file_path = os.path.join(upload_dir, file_name)
with open(file_path, "w") as f:
json.dump(report_data or {}, f, indent=2)
# Create evidence record with anti-fake-evidence classification
evidence = EvidenceDB(
id=str(uuid_module.uuid4()),
control_id=control_db_id,
evidence_type=f"ci_{source}",
title=title,
description=description,
artifact_path=file_path,
artifact_hash=parsed["report_hash"],
file_size_bytes=len(parsed["report_json"]),
mime_type="application/json",
source="ci_pipeline",
ci_job_id=ci_job_id,
valid_from=datetime.utcnow(),
valid_until=datetime.utcnow() + timedelta(days=90),
status=EvidenceStatusEnum(parsed["evidence_status"]),
# CI pipeline evidence → E3 observed (system-observed, hash-verified)
confidence_level=EvidenceConfidenceEnum.E3,
truth_status=EvidenceTruthStatusEnum.OBSERVED,
may_be_used_as_evidence=True,
)
db.add(evidence)
db.commit()
db.refresh(evidence)
return evidence
def _extract_findings_detail(report_data: dict) -> dict:
"""
Extract severity-bucketed finding counts from report data.
Returns dict with keys: critical, high, medium, low.
"""
findings_detail = {
"critical": 0,
"high": 0,
"medium": 0,
"low": 0,
}
if not report_data:
return findings_detail
# Semgrep format
if "results" in report_data:
for r in report_data.get("results", []):
severity = r.get("extra", {}).get("severity", "").upper()
if severity == "CRITICAL":
findings_detail["critical"] += 1
elif severity == "HIGH":
findings_detail["high"] += 1
elif severity == "MEDIUM":
findings_detail["medium"] += 1
elif severity in ["LOW", "INFO"]:
findings_detail["low"] += 1
# Trivy format
elif "Results" in report_data:
for result in report_data.get("Results", []):
for v in result.get("Vulnerabilities", []):
severity = v.get("Severity", "").upper()
if severity == "CRITICAL":
findings_detail["critical"] += 1
elif severity == "HIGH":
findings_detail["high"] += 1
elif severity == "MEDIUM":
findings_detail["medium"] += 1
elif severity == "LOW":
findings_detail["low"] += 1
# Generic findings with severity
elif "findings" in report_data:
for f in report_data.get("findings", []):
severity = f.get("severity", "").upper()
if severity == "CRITICAL":
findings_detail["critical"] += 1
elif severity == "HIGH":
findings_detail["high"] += 1
elif severity == "MEDIUM":
findings_detail["medium"] += 1
else:
findings_detail["low"] += 1
return findings_detail
def _update_risks(db: Session, *, source: str, control_id: str, ci_job_id: str, report_data: dict):
"""
Update risk status based on new evidence.
Uses AutoRiskUpdater to update Control status and linked Risks based on
severity-bucketed findings. Returns the update result or None on error.
"""
findings_detail = _extract_findings_detail(report_data)
try:
auto_updater = AutoRiskUpdater(db)
risk_update_result = auto_updater.process_evidence_collect_request(
tool=source,
control_id=control_id,
evidence_type=f"ci_{source}",
timestamp=datetime.utcnow().isoformat(),
commit_sha=report_data.get("commit_sha", "unknown") if report_data else "unknown",
ci_job_id=ci_job_id,
findings=findings_detail,
)
@@ -227,14 +656,229 @@ async def get_ci_evidence_status(
# Legacy re-exports for tests that import helpers directly.
# ----------------------------------------------------------------------------
__all__ = [
"router",
"SOURCE_CONTROL_MAP",
"EvidenceRepository",
"ControlRepository",
"AutoRiskUpdater",
"_parse_ci_evidence",
"_extract_findings_detail",
"_store_evidence",
"_update_risks",
]
if control_id:
ctrl_repo = ControlRepository(db)
control = ctrl_repo.get_by_control_id(control_id)
if control:
query = query.filter(EvidenceDB.control_id == control.id)
evidence_list = query.order_by(EvidenceDB.collected_at.desc()).limit(100).all()
# Group by control and calculate stats
control_stats = defaultdict(lambda: {
"total": 0,
"valid": 0,
"failed": 0,
"last_collected": None,
"evidence": [],
})
for e in evidence_list:
# Get control_id string
control = db.query(ControlDB).filter(ControlDB.id == e.control_id).first()
ctrl_id = control.control_id if control else "unknown"
stats = control_stats[ctrl_id]
stats["total"] += 1
if e.status:
if e.status.value == "valid":
stats["valid"] += 1
elif e.status.value == "failed":
stats["failed"] += 1
if not stats["last_collected"] or e.collected_at > stats["last_collected"]:
stats["last_collected"] = e.collected_at
# Add evidence summary
stats["evidence"].append({
"id": e.id,
"type": e.evidence_type,
"status": e.status.value if e.status else None,
"collected_at": e.collected_at.isoformat() if e.collected_at else None,
"ci_job_id": e.ci_job_id,
})
# Convert to list and sort
result = []
for ctrl_id, stats in control_stats.items():
result.append({
"control_id": ctrl_id,
"total_evidence": stats["total"],
"valid_count": stats["valid"],
"failed_count": stats["failed"],
"last_collected": stats["last_collected"].isoformat() if stats["last_collected"] else None,
"recent_evidence": stats["evidence"][:5],
})
result.sort(key=lambda x: x["last_collected"] or "", reverse=True)
return {
"period_days": days,
"total_evidence": len(evidence_list),
"controls": result,
}
# ============================================================================
# Evidence Review (Anti-Fake-Evidence)
# ============================================================================
from pydantic import BaseModel as _BaseModel
class _EvidenceReviewRequest(_BaseModel):
confidence_level: Optional[str] = None
truth_status: Optional[str] = None
reviewed_by: str
@router.patch("/evidence/{evidence_id}/review", response_model=EvidenceResponse)
async def review_evidence(
evidence_id: str,
review: _EvidenceReviewRequest,
db: Session = Depends(get_db),
):
"""
Review evidence: upgrade confidence level and/or change truth status.
For Four-Eyes evidence, the first reviewer sets first_reviewer and
approval_status='first_approved'. A second (different) reviewer then
sets second_reviewer and approval_status='approved'.
"""
evidence = db.query(EvidenceDB).filter(EvidenceDB.id == evidence_id).first()
if not evidence:
raise HTTPException(status_code=404, detail=f"Evidence {evidence_id} not found")
old_confidence = evidence.confidence_level.value if evidence.confidence_level else None
old_truth = evidence.truth_status.value if evidence.truth_status else None
if review.confidence_level:
try:
evidence.confidence_level = EvidenceConfidenceEnum(review.confidence_level)
except ValueError:
raise HTTPException(status_code=400, detail=f"Invalid confidence_level: {review.confidence_level}")
if review.truth_status:
try:
evidence.truth_status = EvidenceTruthStatusEnum(review.truth_status)
except ValueError:
raise HTTPException(status_code=400, detail=f"Invalid truth_status: {review.truth_status}")
# Four-Eyes branching
if evidence.requires_four_eyes:
status = evidence.approval_status or "none"
if status in ("none", "pending_first"):
evidence.first_reviewer = review.reviewed_by
evidence.first_reviewed_at = datetime.utcnow()
evidence.approval_status = "first_approved"
elif status == "first_approved":
if review.reviewed_by == evidence.first_reviewer:
raise HTTPException(
status_code=400,
detail="Four-Eyes: second reviewer must be different from first reviewer",
)
evidence.second_reviewer = review.reviewed_by
evidence.second_reviewed_at = datetime.utcnow()
evidence.approval_status = "approved"
elif status == "approved":
raise HTTPException(status_code=400, detail="Evidence already approved")
elif status == "rejected":
raise HTTPException(status_code=400, detail="Evidence was rejected — create new evidence instead")
evidence.reviewed_by = review.reviewed_by
evidence.reviewed_at = datetime.utcnow()
db.commit()
# Audit trail
new_confidence = evidence.confidence_level.value if evidence.confidence_level else None
if old_confidence != new_confidence:
log_audit_trail(
db, "evidence", evidence_id, evidence.title, "review",
performed_by=review.reviewed_by,
field_changed="confidence_level",
old_value=old_confidence,
new_value=new_confidence,
)
new_truth = evidence.truth_status.value if evidence.truth_status else None
if old_truth != new_truth:
log_audit_trail(
db, "evidence", evidence_id, evidence.title, "review",
performed_by=review.reviewed_by,
field_changed="truth_status",
old_value=old_truth,
new_value=new_truth,
)
db.commit()
db.refresh(evidence)
return _build_evidence_response(evidence)
@router.patch("/evidence/{evidence_id}/reject", response_model=EvidenceResponse)
async def reject_evidence(
evidence_id: str,
body: EvidenceRejectRequest,
db: Session = Depends(get_db),
):
"""Reject evidence (sets approval_status='rejected')."""
evidence = db.query(EvidenceDB).filter(EvidenceDB.id == evidence_id).first()
if not evidence:
raise HTTPException(status_code=404, detail=f"Evidence {evidence_id} not found")
evidence.approval_status = "rejected"
evidence.reviewed_by = body.reviewed_by
evidence.reviewed_at = datetime.utcnow()
db.commit()
log_audit_trail(
db, "evidence", evidence_id, evidence.title, "reject",
performed_by=body.reviewed_by,
change_summary=body.rejection_reason or "Evidence rejected",
)
db.commit()
db.refresh(evidence)
return _build_evidence_response(evidence)
# ============================================================================
# Audit Trail Query
# ============================================================================
@router.get("/audit-trail")
async def get_audit_trail(
entity_type: Optional[str] = Query(None),
entity_id: Optional[str] = Query(None),
action: Optional[str] = Query(None),
limit: int = Query(50, ge=1, le=200),
db: Session = Depends(get_db),
):
"""Query audit trail entries for an entity."""
query = db.query(AuditTrailDB)
if entity_type:
query = query.filter(AuditTrailDB.entity_type == entity_type)
if entity_id:
query = query.filter(AuditTrailDB.entity_id == entity_id)
if action:
query = query.filter(AuditTrailDB.action == action)
records = query.order_by(AuditTrailDB.performed_at.desc()).limit(limit).all()
return {
"entries": [
{
"id": r.id,
"entity_type": r.entity_type,
"entity_id": r.entity_id,
"entity_name": r.entity_name,
"action": r.action,
"field_changed": r.field_changed,
"old_value": r.old_value,
"new_value": r.new_value,
"change_summary": r.change_summary,
"performed_by": r.performed_by,
"performed_at": r.performed_at.isoformat() if r.performed_at else None,
"checksum": r.checksum,
}
for r in records
],
"total": len(records),
}

View File

@@ -39,7 +39,6 @@ router = APIRouter(tags=["extraction"])
ALL_COLLECTIONS = [
"bp_compliance_ce", # BSI-TR documents — primary Prüfaspekte source
"bp_compliance_recht", # Legal texts (GDPR, AI Act, ...)
"bp_compliance_gesetze", # German laws
"bp_compliance_datenschutz", # Data protection documents
"bp_dsfa_corpus", # DSFA corpus

View File

@@ -80,9 +80,13 @@ def _handle(func, *args, **kwargs): # type: ignore[no-untyped-def]
raise HTTPException(status_code=400, detail=str(exc))
# ============================================================================
# ISMS Scope (ISO 27001 4.3)
# ============================================================================
# Shared audit trail utilities — canonical implementation in audit_trail_utils.py
from .audit_trail_utils import log_audit_trail, create_signature # noqa: E402
# =============================================================================
# ISMS SCOPE (ISO 27001 4.3)
# =============================================================================
@router.get("/scope", response_model=ISMSScopeResponse)
async def get_isms_scope(db: Session = Depends(get_db)):

View File

@@ -50,6 +50,57 @@ VALID_DOCUMENT_TYPES = {
"cookie_banner",
"agb",
"clause",
# Security document templates (Migration 051)
"it_security_concept",
"data_protection_concept",
"backup_recovery_concept",
"logging_concept",
"incident_response_plan",
"access_control_concept",
"risk_management_concept",
# Policy templates — IT Security (Migration 054)
"information_security_policy",
"access_control_policy",
"password_policy",
"encryption_policy",
"logging_policy",
"backup_policy",
"incident_response_policy",
"change_management_policy",
"patch_management_policy",
"asset_management_policy",
"cloud_security_policy",
"devsecops_policy",
"secrets_management_policy",
"vulnerability_management_policy",
# Policy templates — Data (Migration 054)
"data_protection_policy",
"data_classification_policy",
"data_retention_policy",
"data_transfer_policy",
"privacy_incident_policy",
# Policy templates — Personnel (Migration 054)
"employee_security_policy",
"security_awareness_policy",
"remote_work_policy",
"offboarding_policy",
# Policy templates — Vendor/Supply Chain (Migration 054)
"vendor_risk_management_policy",
"third_party_security_policy",
"supplier_security_policy",
# Policy templates — BCM (Migration 054)
"business_continuity_policy",
"disaster_recovery_policy",
"crisis_management_policy",
# CRA Cybersecurity (Migration 056)
"cybersecurity_policy",
# DSFA template
"dsfa",
# Module document templates (Migration 073)
"vvt_register",
"tom_documentation",
"loeschkonzept",
"pflichtenregister",
}
VALID_STATUSES = {"published", "draft", "archived"}

View File

@@ -0,0 +1,162 @@
"""
FastAPI routes for LLM Generation Audit Trail.
Endpoints:
- POST /llm-audit: Record an LLM generation event
- GET /llm-audit: List audit records with filters
"""
import logging
import uuid as uuid_module
from datetime import datetime
from typing import Optional
from fastapi import APIRouter, Depends, Query
from pydantic import BaseModel
from sqlalchemy.orm import Session
from classroom_engine.database import get_db
from ..db.models import LLMGenerationAuditDB
logger = logging.getLogger(__name__)
router = APIRouter(tags=["compliance-llm-audit"])
# ============================================================================
# Schemas
# ============================================================================
class LLMAuditCreate(BaseModel):
entity_type: str
entity_id: Optional[str] = None
generation_mode: str
truth_status: str = "generated"
may_be_used_as_evidence: bool = False
llm_model: Optional[str] = None
llm_provider: Optional[str] = None
prompt_hash: Optional[str] = None
input_summary: Optional[str] = None
output_summary: Optional[str] = None
metadata: Optional[dict] = None
tenant_id: Optional[str] = None
class LLMAuditResponse(BaseModel):
id: str
tenant_id: Optional[str] = None
entity_type: str
entity_id: Optional[str] = None
generation_mode: str
truth_status: str
may_be_used_as_evidence: bool
llm_model: Optional[str] = None
llm_provider: Optional[str] = None
prompt_hash: Optional[str] = None
input_summary: Optional[str] = None
output_summary: Optional[str] = None
metadata: Optional[dict] = None
created_at: datetime
class Config:
from_attributes = True
# ============================================================================
# Routes
# ============================================================================
@router.post("/llm-audit", response_model=LLMAuditResponse)
async def create_llm_audit(
data: LLMAuditCreate,
db: Session = Depends(get_db),
):
"""Record an LLM generation event for audit trail."""
from ..db.models import EvidenceTruthStatusEnum
# Validate truth_status
try:
truth_enum = EvidenceTruthStatusEnum(data.truth_status)
except ValueError:
truth_enum = EvidenceTruthStatusEnum.GENERATED
record = LLMGenerationAuditDB(
id=str(uuid_module.uuid4()),
tenant_id=data.tenant_id,
entity_type=data.entity_type,
entity_id=data.entity_id,
generation_mode=data.generation_mode,
truth_status=truth_enum,
may_be_used_as_evidence=data.may_be_used_as_evidence,
llm_model=data.llm_model,
llm_provider=data.llm_provider,
prompt_hash=data.prompt_hash,
input_summary=data.input_summary[:500] if data.input_summary else None,
output_summary=data.output_summary[:500] if data.output_summary else None,
extra_metadata=data.metadata or {},
)
db.add(record)
db.commit()
db.refresh(record)
return LLMAuditResponse(
id=record.id,
tenant_id=record.tenant_id,
entity_type=record.entity_type,
entity_id=record.entity_id,
generation_mode=record.generation_mode,
truth_status=record.truth_status.value if record.truth_status else "generated",
may_be_used_as_evidence=record.may_be_used_as_evidence,
llm_model=record.llm_model,
llm_provider=record.llm_provider,
prompt_hash=record.prompt_hash,
input_summary=record.input_summary,
output_summary=record.output_summary,
metadata=record.extra_metadata,
created_at=record.created_at,
)
@router.get("/llm-audit")
async def list_llm_audit(
entity_type: Optional[str] = Query(None),
entity_id: Optional[str] = Query(None),
page: int = Query(1, ge=1),
limit: int = Query(50, ge=1, le=200),
db: Session = Depends(get_db),
):
"""List LLM generation audit records with optional filters."""
query = db.query(LLMGenerationAuditDB)
if entity_type:
query = query.filter(LLMGenerationAuditDB.entity_type == entity_type)
if entity_id:
query = query.filter(LLMGenerationAuditDB.entity_id == entity_id)
total = query.count()
offset = (page - 1) * limit
records = query.order_by(LLMGenerationAuditDB.created_at.desc()).offset(offset).limit(limit).all()
return {
"records": [
LLMAuditResponse(
id=r.id,
tenant_id=r.tenant_id,
entity_type=r.entity_type,
entity_id=r.entity_id,
generation_mode=r.generation_mode,
truth_status=r.truth_status.value if r.truth_status else "generated",
may_be_used_as_evidence=r.may_be_used_as_evidence,
llm_model=r.llm_model,
llm_provider=r.llm_provider,
prompt_hash=r.prompt_hash,
input_summary=r.input_summary,
output_summary=r.output_summary,
metadata=r.extra_metadata,
created_at=r.created_at,
)
for r in records
],
"total": total,
"page": page,
"limit": limit,
}

View File

@@ -56,6 +56,7 @@ class LoeschfristCreate(BaseModel):
responsible_person: Optional[str] = None
release_process: Optional[str] = None
linked_vvt_activity_ids: Optional[List[Any]] = None
linked_vendor_ids: Optional[List[Any]] = None
status: str = "DRAFT"
last_review_date: Optional[datetime] = None
next_review_date: Optional[datetime] = None
@@ -86,6 +87,7 @@ class LoeschfristUpdate(BaseModel):
responsible_person: Optional[str] = None
release_process: Optional[str] = None
linked_vvt_activity_ids: Optional[List[Any]] = None
linked_vendor_ids: Optional[List[Any]] = None
status: Optional[str] = None
last_review_date: Optional[datetime] = None
next_review_date: Optional[datetime] = None
@@ -100,7 +102,7 @@ class StatusUpdate(BaseModel):
# JSONB fields that need CAST
JSONB_FIELDS = {
"affected_groups", "data_categories", "legal_holds",
"storage_locations", "linked_vvt_activity_ids", "tags"
"storage_locations", "linked_vvt_activity_ids", "linked_vendor_ids", "tags"
}

View File

@@ -42,6 +42,7 @@ class ObligationCreate(BaseModel):
priority: str = "medium"
responsible: Optional[str] = None
linked_systems: Optional[List[str]] = None
linked_vendor_ids: Optional[List[str]] = None
assessment_id: Optional[str] = None
rule_code: Optional[str] = None
notes: Optional[str] = None
@@ -57,6 +58,7 @@ class ObligationUpdate(BaseModel):
priority: Optional[str] = None
responsible: Optional[str] = None
linked_systems: Optional[List[str]] = None
linked_vendor_ids: Optional[List[str]] = None
notes: Optional[str] = None
@@ -173,14 +175,15 @@ async def create_obligation(
import json
linked_systems = json.dumps(payload.linked_systems or [])
linked_vendor_ids = json.dumps(payload.linked_vendor_ids or [])
row = db.execute(text("""
INSERT INTO compliance_obligations
(tenant_id, title, description, source, source_article, deadline,
status, priority, responsible, linked_systems, assessment_id, rule_code, notes)
status, priority, responsible, linked_systems, linked_vendor_ids, assessment_id, rule_code, notes)
VALUES
(:tenant_id, :title, :description, :source, :source_article, :deadline,
:status, :priority, :responsible, CAST(:linked_systems AS jsonb), :assessment_id, :rule_code, :notes)
:status, :priority, :responsible, CAST(:linked_systems AS jsonb), CAST(:linked_vendor_ids AS jsonb), :assessment_id, :rule_code, :notes)
RETURNING *
"""), {
"tenant_id": tenant_id,
@@ -193,6 +196,7 @@ async def create_obligation(
"priority": payload.priority,
"responsible": payload.responsible,
"linked_systems": linked_systems,
"linked_vendor_ids": linked_vendor_ids,
"assessment_id": payload.assessment_id,
"rule_code": payload.rule_code,
"notes": payload.notes,
@@ -235,6 +239,9 @@ async def update_obligation(
if field == "linked_systems":
updates["linked_systems"] = json.dumps(value or [])
set_clauses.append("linked_systems = CAST(:linked_systems AS jsonb)")
elif field == "linked_vendor_ids":
updates["linked_vendor_ids"] = json.dumps(value or [])
set_clauses.append("linked_vendor_ids = CAST(:linked_vendor_ids AS jsonb)")
else:
updates[field] = value
set_clauses.append(f"{field} = :{field}")

File diff suppressed because it is too large Load Diff

View File

@@ -25,6 +25,7 @@ from sqlalchemy.orm import Session
from classroom_engine.database import get_db
from .audit_trail_utils import log_audit_trail
from ..db import (
ControlDomainEnum,
ControlRepository,
@@ -312,8 +313,39 @@ async def get_control(
svc: ControlExportService = Depends(get_ctrl_export_service),
) -> ControlResponse:
"""Get a specific control by control_id."""
with translate_domain_errors():
return svc.get_control(control_id)
repo = ControlRepository(db)
control = repo.get_by_control_id(control_id)
if not control:
raise HTTPException(status_code=404, detail=f"Control {control_id} not found")
evidence_repo = EvidenceRepository(db)
evidence = evidence_repo.get_by_control(control.id)
return ControlResponse(
id=control.id,
control_id=control.control_id,
domain=control.domain.value if control.domain else None,
control_type=control.control_type.value if control.control_type else None,
title=control.title,
description=control.description,
pass_criteria=control.pass_criteria,
implementation_guidance=control.implementation_guidance,
code_reference=control.code_reference,
documentation_url=control.documentation_url,
is_automated=control.is_automated,
automation_tool=control.automation_tool,
automation_config=control.automation_config,
owner=control.owner,
review_frequency_days=control.review_frequency_days,
status=control.status.value if control.status else None,
status_notes=control.status_notes,
status_justification=control.status_justification,
last_reviewed_at=control.last_reviewed_at,
next_review_at=control.next_review_at,
created_at=control.created_at,
updated_at=control.updated_at,
evidence_count=len(evidence),
)
@router.put(
@@ -325,8 +357,83 @@ async def update_control(
svc: ControlExportService = Depends(get_ctrl_export_service),
) -> ControlResponse:
"""Update a control."""
with translate_domain_errors():
return svc.update_control(control_id, update)
repo = ControlRepository(db)
control = repo.get_by_control_id(control_id)
if not control:
raise HTTPException(status_code=404, detail=f"Control {control_id} not found")
update_data = update.model_dump(exclude_unset=True)
# Convert status string to enum and validate transition
if "status" in update_data:
try:
new_status_enum = ControlStatusEnum(update_data["status"])
except ValueError:
raise HTTPException(status_code=400, detail=f"Invalid status: {update_data['status']}")
# Validate status transition (Anti-Fake-Evidence)
from ..services.control_status_machine import validate_transition
current_status = control.status.value if control.status else "planned"
evidence_list = db.query(EvidenceDB).filter(EvidenceDB.control_id == control.id).all()
allowed, violations = validate_transition(
current_status=current_status,
new_status=update_data["status"],
evidence_list=evidence_list,
status_justification=update_data.get("status_justification") or update_data.get("status_notes"),
)
if not allowed:
raise HTTPException(
status_code=409,
detail={
"error": "Status transition not allowed",
"current_status": current_status,
"requested_status": update_data["status"],
"violations": violations,
}
)
update_data["status"] = new_status_enum
updated = repo.update(control.id, **update_data)
db.commit()
# Audit trail for status changes
new_status = updated.status.value if updated.status else None
if "status" in update.model_dump(exclude_unset=True) and current_status != new_status:
log_audit_trail(
db, "control", control.id, updated.control_id or updated.title,
"status_change",
performed_by=update.owner or "system",
field_changed="status",
old_value=current_status,
new_value=new_status,
)
db.commit()
return ControlResponse(
id=updated.id,
control_id=updated.control_id,
domain=updated.domain.value if updated.domain else None,
control_type=updated.control_type.value if updated.control_type else None,
title=updated.title,
description=updated.description,
pass_criteria=updated.pass_criteria,
implementation_guidance=updated.implementation_guidance,
code_reference=updated.code_reference,
documentation_url=updated.documentation_url,
is_automated=updated.is_automated,
automation_tool=updated.automation_tool,
automation_config=updated.automation_config,
owner=updated.owner,
review_frequency_days=updated.review_frequency_days,
status=updated.status.value if updated.status else None,
status_notes=updated.status_notes,
status_justification=updated.status_justification,
last_reviewed_at=updated.last_reviewed_at,
next_review_at=updated.next_review_at,
created_at=updated.created_at,
updated_at=updated.updated_at,
)
@router.put(
@@ -339,8 +446,43 @@ async def review_control(
svc: ControlExportService = Depends(get_ctrl_export_service),
) -> ControlResponse:
"""Mark a control as reviewed with new status."""
with translate_domain_errors():
return svc.review_control(control_id, review)
repo = ControlRepository(db)
control = repo.get_by_control_id(control_id)
if not control:
raise HTTPException(status_code=404, detail=f"Control {control_id} not found")
try:
status_enum = ControlStatusEnum(review.status)
except ValueError:
raise HTTPException(status_code=400, detail=f"Invalid status: {review.status}")
updated = repo.mark_reviewed(control.id, status_enum, review.status_notes)
db.commit()
return ControlResponse(
id=updated.id,
control_id=updated.control_id,
domain=updated.domain.value if updated.domain else None,
control_type=updated.control_type.value if updated.control_type else None,
title=updated.title,
description=updated.description,
pass_criteria=updated.pass_criteria,
implementation_guidance=updated.implementation_guidance,
code_reference=updated.code_reference,
documentation_url=updated.documentation_url,
is_automated=updated.is_automated,
automation_tool=updated.automation_tool,
automation_config=updated.automation_config,
owner=updated.owner,
review_frequency_days=updated.review_frequency_days,
status=updated.status.value if updated.status else None,
status_notes=updated.status_notes,
status_justification=updated.status_justification,
last_reviewed_at=updated.last_reviewed_at,
next_review_at=updated.next_review_at,
created_at=updated.created_at,
updated_at=updated.updated_at,
)
@router.get(

File diff suppressed because it is too large Load Diff

View File

@@ -22,7 +22,9 @@ import uuid
from datetime import datetime, timezone
from typing import Any
from fastapi import APIRouter, File, Form, HTTPException, UploadFile
import httpx
from fastapi import APIRouter, File, Form, UploadFile, HTTPException
from pydantic import BaseModel
from sqlalchemy import text
from database import SessionLocal # re-exported below for legacy test patches
@@ -96,15 +98,13 @@ async def scan_dependencies(
db = SessionLocal()
try:
db.execute(
text(
"INSERT INTO compliance_screenings "
"(id, tenant_id, status, sbom_format, sbom_version, "
"total_components, total_issues, critical_issues, high_issues, "
"medium_issues, low_issues, sbom_data, started_at, completed_at) "
"VALUES (:id, :tenant_id, 'completed', 'CycloneDX', '1.5', "
":total_components, :total_issues, :critical, :high, :medium, :low, "
":sbom_data::jsonb, :started_at, :completed_at)"
),
text("""INSERT INTO compliance_screenings
(id, tenant_id, status, sbom_format, sbom_version,
total_components, total_issues, critical_issues, high_issues, medium_issues, low_issues,
sbom_data, started_at, completed_at)
VALUES (:id, :tenant_id, 'completed', 'CycloneDX', '1.5',
:total_components, :total_issues, :critical, :high, :medium, :low,
:sbom_data::jsonb, :started_at, :completed_at)"""),
{
"id": screening_id,
"tenant_id": tenant_id,
@@ -121,13 +121,11 @@ async def scan_dependencies(
)
for issue in issues:
db.execute(
text(
"INSERT INTO compliance_security_issues "
"(id, screening_id, severity, title, description, cve, cvss, "
"affected_component, affected_version, fixed_in, remediation, status) "
"VALUES (:id, :screening_id, :severity, :title, :description, :cve, :cvss, "
":component, :version, :fixed_in, :remediation, :status)"
),
text("""INSERT INTO compliance_security_issues
(id, screening_id, severity, title, description, cve, cvss,
affected_component, affected_version, fixed_in, remediation, status)
VALUES (:id, :screening_id, :severity, :title, :description, :cve, :cvss,
:component, :version, :fixed_in, :remediation, :status)"""),
{
"id": issue["id"],
"screening_id": screening_id,
@@ -214,8 +212,77 @@ async def get_screening(screening_id: str) -> ScreeningResponse:
"""Get a screening result by ID."""
db = SessionLocal()
try:
with translate_domain_errors():
return ScreeningService(db).get_screening(screening_id)
result = db.execute(
text("""SELECT id, status, sbom_format, sbom_version,
total_components, total_issues, critical_issues, high_issues,
medium_issues, low_issues, sbom_data, started_at, completed_at
FROM compliance_screenings WHERE id = :id"""),
{"id": screening_id},
)
row = result.fetchone()
if not row:
raise HTTPException(status_code=404, detail="Screening not found")
# Fetch issues
issues_result = db.execute(
text("""SELECT id, severity, title, description, cve, cvss,
affected_component, affected_version, fixed_in, remediation, status
FROM compliance_security_issues WHERE screening_id = :id"""),
{"id": screening_id},
)
issues_rows = issues_result.fetchall()
issues = [
SecurityIssueResponse(
id=str(r[0]), severity=r[1], title=r[2], description=r[3],
cve=r[4], cvss=r[5], affected_component=r[6],
affected_version=r[7], fixed_in=r[8], remediation=r[9], status=r[10],
)
for r in issues_rows
]
# Reconstruct components from SBOM data
sbom_data = row[10] or {}
components = []
comp_vulns: dict[str, list[dict]] = {}
for issue in issues:
if issue.affected_component not in comp_vulns:
comp_vulns[issue.affected_component] = []
comp_vulns[issue.affected_component].append({
"id": issue.cve or issue.id,
"cve": issue.cve,
"severity": issue.severity,
"title": issue.title,
"cvss": issue.cvss,
"fixedIn": issue.fixed_in,
})
for sc in sbom_data.get("components", []):
components.append(SBOMComponentResponse(
name=sc["name"],
version=sc["version"],
type=sc.get("type", "library"),
purl=sc.get("purl", ""),
licenses=sc.get("licenses", []),
vulnerabilities=comp_vulns.get(sc["name"], []),
))
return ScreeningResponse(
id=str(row[0]),
status=row[1],
sbom_format=row[2] or "CycloneDX",
sbom_version=row[3] or "1.5",
total_components=row[4] or 0,
total_issues=row[5] or 0,
critical_issues=row[6] or 0,
high_issues=row[7] or 0,
medium_issues=row[8] or 0,
low_issues=row[9] or 0,
components=components,
issues=issues,
started_at=str(row[11]) if row[11] else None,
completed_at=str(row[12]) if row[12] else None,
)
finally:
db.close()
@@ -225,8 +292,33 @@ async def list_screenings(tenant_id: str = "default") -> ScreeningListResponse:
"""List all screenings for a tenant."""
db = SessionLocal()
try:
with translate_domain_errors():
return ScreeningService(db).list_screenings(tenant_id)
result = db.execute(
text("""SELECT id, status, total_components, total_issues,
critical_issues, high_issues, medium_issues, low_issues,
started_at, completed_at, created_at
FROM compliance_screenings
WHERE tenant_id = :tenant_id
ORDER BY created_at DESC"""),
{"tenant_id": tenant_id},
)
rows = result.fetchall()
screenings = [
{
"id": str(r[0]),
"status": r[1],
"total_components": r[2],
"total_issues": r[3],
"critical_issues": r[4],
"high_issues": r[5],
"medium_issues": r[6],
"low_issues": r[7],
"started_at": str(r[8]) if r[8] else None,
"completed_at": str(r[9]) if r[9] else None,
"created_at": str(r[10]),
}
for r in rows
]
return ScreeningListResponse(screenings=screenings, total=len(screenings))
finally:
db.close()

View File

@@ -0,0 +1,537 @@
"""
TOM ↔ Canonical Control Mapping Routes.
Three-layer architecture:
TOM Measures (~88, audit-level) → Mapping Bridge → Canonical Controls (10,000+)
Endpoints:
POST /v1/tom-mappings/sync — Sync canonical controls for company profile
GET /v1/tom-mappings — List all mappings for tenant/project
GET /v1/tom-mappings/by-tom/{code} — Mappings for a specific TOM control
GET /v1/tom-mappings/stats — Coverage statistics
POST /v1/tom-mappings/manual — Manually add a mapping
DELETE /v1/tom-mappings/{id} — Remove a mapping
"""
from __future__ import annotations
import hashlib
import json
import logging
from typing import Any, Optional
from fastapi import APIRouter, HTTPException, Query, Header
from pydantic import BaseModel
from sqlalchemy import text
from database import SessionLocal
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/tom-mappings", tags=["tom-control-mappings"])
# =============================================================================
# TOM CATEGORY → CANONICAL CATEGORY MAPPING
# =============================================================================
# Maps 13 TOM control categories to canonical_control_categories
# Each TOM category maps to 1-3 canonical categories for broad coverage
TOM_TO_CANONICAL_CATEGORIES: dict[str, list[str]] = {
"ACCESS_CONTROL": ["authentication", "identity", "physical"],
"ADMISSION_CONTROL": ["authentication", "identity", "system"],
"ACCESS_AUTHORIZATION": ["authentication", "identity"],
"TRANSFER_CONTROL": ["network", "data_protection", "encryption"],
"INPUT_CONTROL": ["application", "data_protection"],
"ORDER_CONTROL": ["supply_chain", "compliance"],
"AVAILABILITY": ["continuity", "system"],
"SEPARATION": ["network", "data_protection"],
"ENCRYPTION": ["encryption"],
"PSEUDONYMIZATION": ["data_protection", "encryption"],
"RESILIENCE": ["continuity", "system"],
"RECOVERY": ["continuity"],
"REVIEW": ["compliance", "governance", "risk"],
}
# =============================================================================
# REQUEST / RESPONSE MODELS
# =============================================================================
class SyncRequest(BaseModel):
"""Trigger a sync of canonical controls to TOM measures."""
industry: Optional[str] = None
company_size: Optional[str] = None
force: bool = False
class ManualMappingRequest(BaseModel):
"""Manually add a canonical control to a TOM measure."""
tom_control_code: str
tom_category: str
canonical_control_id: str
canonical_control_code: str
canonical_category: Optional[str] = None
relevance_score: float = 1.0
# =============================================================================
# HELPERS
# =============================================================================
def _get_tenant_id(x_tenant_id: Optional[str]) -> str:
"""Extract tenant ID from header."""
if not x_tenant_id:
raise HTTPException(status_code=400, detail="X-Tenant-ID header required")
return x_tenant_id
def _compute_profile_hash(industry: Optional[str], company_size: Optional[str]) -> str:
"""Compute a hash from profile parameters for change detection."""
data = json.dumps({"industry": industry, "company_size": company_size}, sort_keys=True)
return hashlib.sha256(data.encode()).hexdigest()[:16]
def _mapping_row_to_dict(r) -> dict[str, Any]:
"""Convert a mapping row to API response dict."""
return {
"id": str(r.id),
"tenant_id": str(r.tenant_id),
"project_id": str(r.project_id) if r.project_id else None,
"tom_control_code": r.tom_control_code,
"tom_category": r.tom_category,
"canonical_control_id": str(r.canonical_control_id),
"canonical_control_code": r.canonical_control_code,
"canonical_category": r.canonical_category,
"mapping_type": r.mapping_type,
"relevance_score": float(r.relevance_score) if r.relevance_score else 1.0,
"created_at": r.created_at.isoformat() if r.created_at else None,
}
# =============================================================================
# SYNC ENDPOINT
# =============================================================================
@router.post("/sync")
async def sync_mappings(
body: SyncRequest,
x_tenant_id: Optional[str] = Header(None, alias="X-Tenant-ID"),
project_id: Optional[str] = Query(None),
):
"""
Sync canonical controls to TOM measures based on company profile.
Algorithm:
1. Compute profile hash → skip if unchanged (unless force=True)
2. For each TOM category, find matching canonical controls by:
- Category mapping (TOM category → canonical categories)
- Industry filter (applicable_industries JSONB containment)
- Company size filter (applicable_company_size JSONB containment)
- Only approved + customer_visible controls
3. Delete old auto-mappings, insert new ones
4. Update sync state
"""
tenant_id = _get_tenant_id(x_tenant_id)
profile_hash = _compute_profile_hash(body.industry, body.company_size)
with SessionLocal() as db:
# Check if sync is needed (profile unchanged)
if not body.force:
existing = db.execute(
text("""
SELECT profile_hash FROM tom_control_sync_state
WHERE tenant_id = :tid AND (project_id = :pid OR (project_id IS NULL AND :pid IS NULL))
"""),
{"tid": tenant_id, "pid": project_id},
).fetchone()
if existing and existing.profile_hash == profile_hash:
return {
"status": "unchanged",
"message": "Profile unchanged since last sync",
"profile_hash": profile_hash,
}
# Delete old auto-mappings for this tenant+project
db.execute(
text("""
DELETE FROM tom_control_mappings
WHERE tenant_id = :tid
AND (project_id = :pid OR (project_id IS NULL AND :pid IS NULL))
AND mapping_type = 'auto'
"""),
{"tid": tenant_id, "pid": project_id},
)
total_mappings = 0
canonical_ids_matched = set()
tom_codes_covered = set()
# For each TOM category, find matching canonical controls
for tom_category, canonical_categories in TOM_TO_CANONICAL_CATEGORIES.items():
# Build JSONB containment query for categories
cat_conditions = " OR ".join(
f"category = :cat_{i}" for i in range(len(canonical_categories))
)
cat_params = {f"cat_{i}": c for i, c in enumerate(canonical_categories)}
# Build industry filter
industry_filter = ""
if body.industry:
industry_filter = """
AND (
applicable_industries IS NULL
OR applicable_industries @> '"all"'::jsonb
OR applicable_industries @> (:industry)::jsonb
)
"""
cat_params["industry"] = json.dumps([body.industry])
# Build company size filter
size_filter = ""
if body.company_size:
size_filter = """
AND (
applicable_company_size IS NULL
OR applicable_company_size @> '"all"'::jsonb
OR applicable_company_size @> (:csize)::jsonb
)
"""
cat_params["csize"] = json.dumps([body.company_size])
query = f"""
SELECT id, control_id, category
FROM canonical_controls
WHERE ({cat_conditions})
AND release_state = 'approved'
AND customer_visible = true
{industry_filter}
{size_filter}
ORDER BY control_id
"""
rows = db.execute(text(query), cat_params).fetchall()
# Find TOM control codes in this category (query the frontend library
# codes; we use the category prefix pattern from the loader)
# TOM codes follow pattern: TOM-XX-NN where XX is category abbreviation
# We insert one mapping per canonical control per TOM category
for row in rows:
db.execute(
text("""
INSERT INTO tom_control_mappings (
tenant_id, project_id, tom_control_code, tom_category,
canonical_control_id, canonical_control_code, canonical_category,
mapping_type, relevance_score
) VALUES (
:tid, :pid, :tom_cat, :tom_cat,
:cc_id, :cc_code, :cc_category,
'auto', 1.00
)
ON CONFLICT (tenant_id, project_id, tom_control_code, canonical_control_id)
DO NOTHING
"""),
{
"tid": tenant_id,
"pid": project_id,
"tom_cat": tom_category,
"cc_id": str(row.id),
"cc_code": row.control_id,
"cc_category": row.category,
},
)
total_mappings += 1
canonical_ids_matched.add(str(row.id))
tom_codes_covered.add(tom_category)
# Upsert sync state
db.execute(
text("""
INSERT INTO tom_control_sync_state (
tenant_id, project_id, profile_hash,
total_mappings, canonical_controls_matched, tom_controls_covered,
last_synced_at
) VALUES (
:tid, :pid, :hash,
:total, :matched, :covered,
NOW()
)
ON CONFLICT (tenant_id, project_id)
DO UPDATE SET
profile_hash = :hash,
total_mappings = :total,
canonical_controls_matched = :matched,
tom_controls_covered = :covered,
last_synced_at = NOW()
"""),
{
"tid": tenant_id,
"pid": project_id,
"hash": profile_hash,
"total": total_mappings,
"matched": len(canonical_ids_matched),
"covered": len(tom_codes_covered),
},
)
db.commit()
return {
"status": "synced",
"profile_hash": profile_hash,
"total_mappings": total_mappings,
"canonical_controls_matched": len(canonical_ids_matched),
"tom_categories_covered": len(tom_codes_covered),
}
# =============================================================================
# LIST MAPPINGS
# =============================================================================
@router.get("")
async def list_mappings(
x_tenant_id: Optional[str] = Header(None, alias="X-Tenant-ID"),
project_id: Optional[str] = Query(None),
tom_category: Optional[str] = Query(None),
mapping_type: Optional[str] = Query(None),
limit: int = Query(500, ge=1, le=5000),
offset: int = Query(0, ge=0),
):
"""List all TOM ↔ canonical control mappings for tenant/project."""
tenant_id = _get_tenant_id(x_tenant_id)
query = """
SELECT m.*, cc.title as canonical_title, cc.severity as canonical_severity
FROM tom_control_mappings m
LEFT JOIN canonical_controls cc ON cc.id = m.canonical_control_id
WHERE m.tenant_id = :tid
AND (m.project_id = :pid OR (m.project_id IS NULL AND :pid IS NULL))
"""
params: dict[str, Any] = {"tid": tenant_id, "pid": project_id}
if tom_category:
query += " AND m.tom_category = :tcat"
params["tcat"] = tom_category
if mapping_type:
query += " AND m.mapping_type = :mtype"
params["mtype"] = mapping_type
query += " ORDER BY m.tom_category, m.canonical_control_code"
query += " LIMIT :lim OFFSET :off"
params["lim"] = limit
params["off"] = offset
count_query = """
SELECT count(*) FROM tom_control_mappings
WHERE tenant_id = :tid
AND (project_id = :pid OR (project_id IS NULL AND :pid IS NULL))
"""
count_params: dict[str, Any] = {"tid": tenant_id, "pid": project_id}
if tom_category:
count_query += " AND tom_category = :tcat"
count_params["tcat"] = tom_category
with SessionLocal() as db:
rows = db.execute(text(query), params).fetchall()
total = db.execute(text(count_query), count_params).scalar()
mappings = []
for r in rows:
d = _mapping_row_to_dict(r)
d["canonical_title"] = getattr(r, "canonical_title", None)
d["canonical_severity"] = getattr(r, "canonical_severity", None)
mappings.append(d)
return {"mappings": mappings, "total": total}
# =============================================================================
# MAPPINGS BY TOM CONTROL
# =============================================================================
@router.get("/by-tom/{tom_code}")
async def get_mappings_by_tom(
tom_code: str,
x_tenant_id: Optional[str] = Header(None, alias="X-Tenant-ID"),
project_id: Optional[str] = Query(None),
):
"""Get all canonical controls mapped to a specific TOM control code or category."""
tenant_id = _get_tenant_id(x_tenant_id)
with SessionLocal() as db:
rows = db.execute(
text("""
SELECT m.*, cc.title as canonical_title, cc.severity as canonical_severity,
cc.objective as canonical_objective
FROM tom_control_mappings m
LEFT JOIN canonical_controls cc ON cc.id = m.canonical_control_id
WHERE m.tenant_id = :tid
AND (m.project_id = :pid OR (m.project_id IS NULL AND :pid IS NULL))
AND (m.tom_control_code = :code OR m.tom_category = :code)
ORDER BY m.canonical_control_code
"""),
{"tid": tenant_id, "pid": project_id, "code": tom_code},
).fetchall()
mappings = []
for r in rows:
d = _mapping_row_to_dict(r)
d["canonical_title"] = getattr(r, "canonical_title", None)
d["canonical_severity"] = getattr(r, "canonical_severity", None)
d["canonical_objective"] = getattr(r, "canonical_objective", None)
mappings.append(d)
return {"tom_code": tom_code, "mappings": mappings, "total": len(mappings)}
# =============================================================================
# STATS
# =============================================================================
@router.get("/stats")
async def get_mapping_stats(
x_tenant_id: Optional[str] = Header(None, alias="X-Tenant-ID"),
project_id: Optional[str] = Query(None),
):
"""Coverage statistics for TOM ↔ canonical control mappings."""
tenant_id = _get_tenant_id(x_tenant_id)
with SessionLocal() as db:
# Sync state
sync_state = db.execute(
text("""
SELECT * FROM tom_control_sync_state
WHERE tenant_id = :tid
AND (project_id = :pid OR (project_id IS NULL AND :pid IS NULL))
"""),
{"tid": tenant_id, "pid": project_id},
).fetchone()
# Per-category breakdown
category_stats = db.execute(
text("""
SELECT tom_category,
count(*) as total_mappings,
count(DISTINCT canonical_control_id) as unique_controls,
count(*) FILTER (WHERE mapping_type = 'auto') as auto_count,
count(*) FILTER (WHERE mapping_type = 'manual') as manual_count
FROM tom_control_mappings
WHERE tenant_id = :tid
AND (project_id = :pid OR (project_id IS NULL AND :pid IS NULL))
GROUP BY tom_category
ORDER BY tom_category
"""),
{"tid": tenant_id, "pid": project_id},
).fetchall()
# Total canonical controls in DB (approved + visible)
total_canonical = db.execute(
text("""
SELECT count(*) FROM canonical_controls
WHERE release_state = 'approved' AND customer_visible = true
""")
).scalar()
return {
"sync_state": {
"profile_hash": sync_state.profile_hash if sync_state else None,
"total_mappings": sync_state.total_mappings if sync_state else 0,
"canonical_controls_matched": sync_state.canonical_controls_matched if sync_state else 0,
"tom_controls_covered": sync_state.tom_controls_covered if sync_state else 0,
"last_synced_at": sync_state.last_synced_at.isoformat() if sync_state and sync_state.last_synced_at else None,
},
"category_breakdown": [
{
"tom_category": r.tom_category,
"total_mappings": r.total_mappings,
"unique_controls": r.unique_controls,
"auto_count": r.auto_count,
"manual_count": r.manual_count,
}
for r in category_stats
],
"total_canonical_controls_available": total_canonical or 0,
}
# =============================================================================
# MANUAL MAPPING
# =============================================================================
@router.post("/manual", status_code=201)
async def add_manual_mapping(
body: ManualMappingRequest,
x_tenant_id: Optional[str] = Header(None, alias="X-Tenant-ID"),
project_id: Optional[str] = Query(None),
):
"""Manually add a canonical control to a TOM measure."""
tenant_id = _get_tenant_id(x_tenant_id)
with SessionLocal() as db:
# Verify canonical control exists
cc = db.execute(
text("SELECT id, control_id, category FROM canonical_controls WHERE id = CAST(:cid AS uuid)"),
{"cid": body.canonical_control_id},
).fetchone()
if not cc:
raise HTTPException(status_code=404, detail="Canonical control not found")
try:
row = db.execute(
text("""
INSERT INTO tom_control_mappings (
tenant_id, project_id, tom_control_code, tom_category,
canonical_control_id, canonical_control_code, canonical_category,
mapping_type, relevance_score
) VALUES (
:tid, :pid, :tom_code, :tom_cat,
CAST(:cc_id AS uuid), :cc_code, :cc_category,
'manual', :score
)
RETURNING *
"""),
{
"tid": tenant_id,
"pid": project_id,
"tom_code": body.tom_control_code,
"tom_cat": body.tom_category,
"cc_id": body.canonical_control_id,
"cc_code": body.canonical_control_code,
"cc_category": body.canonical_category or cc.category,
"score": body.relevance_score,
},
).fetchone()
db.commit()
except Exception as e:
if "unique" in str(e).lower() or "duplicate" in str(e).lower():
raise HTTPException(status_code=409, detail="Mapping already exists")
raise
return _mapping_row_to_dict(row)
# =============================================================================
# DELETE MAPPING
# =============================================================================
@router.delete("/{mapping_id}", status_code=204)
async def delete_mapping(
mapping_id: str,
x_tenant_id: Optional[str] = Header(None, alias="X-Tenant-ID"),
):
"""Remove a mapping (manual or auto)."""
tenant_id = _get_tenant_id(x_tenant_id)
with SessionLocal() as db:
result = db.execute(
text("""
DELETE FROM tom_control_mappings
WHERE id = CAST(:mid AS uuid) AND tenant_id = :tid
"""),
{"mid": mapping_id, "tid": tenant_id},
)
if result.rowcount == 0:
raise HTTPException(status_code=404, detail="Mapping not found")
db.commit()
return None

View File

@@ -0,0 +1,427 @@
"""
FastAPI routes for VVT Master Libraries + Process Templates.
Library endpoints (read-only, global):
GET /vvt/libraries — Overview: all library types + counts
GET /vvt/libraries/data-subjects — Data subjects (filter: typical_for)
GET /vvt/libraries/data-categories — Hierarchical (filter: parent_id, is_art9, flat)
GET /vvt/libraries/recipients — Recipients (filter: type)
GET /vvt/libraries/legal-bases — Legal bases (filter: is_art9, type)
GET /vvt/libraries/retention-rules — Retention rules
GET /vvt/libraries/transfer-mechanisms — Transfer mechanisms
GET /vvt/libraries/purposes — Purposes (filter: typical_for)
GET /vvt/libraries/toms — TOMs (filter: category)
Template endpoints:
GET /vvt/templates — List templates (filter: business_function, search)
GET /vvt/templates/{id} — Single template with resolved labels
POST /vvt/templates/{id}/instantiate — Create VVT activity from template
"""
import logging
import uuid
from datetime import datetime
from typing import Optional
from fastapi import APIRouter, Depends, HTTPException, Query, Request
from sqlalchemy.orm import Session
from classroom_engine.database import get_db
from ..db.vvt_library_models import (
VVTLibDataSubjectDB,
VVTLibDataCategoryDB,
VVTLibRecipientDB,
VVTLibLegalBasisDB,
VVTLibRetentionRuleDB,
VVTLibTransferMechanismDB,
VVTLibPurposeDB,
VVTLibTomDB,
VVTProcessTemplateDB,
)
from ..db.vvt_models import VVTActivityDB, VVTAuditLogDB
from .tenant_utils import get_tenant_id
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/vvt", tags=["compliance-vvt-libraries"])
# ============================================================================
# Helper: row → dict
# ============================================================================
def _row_to_dict(row, extra_fields=None):
"""Generic row → dict for library items."""
d = {
"id": row.id,
"label_de": row.label_de,
}
if hasattr(row, 'description_de') and row.description_de:
d["description_de"] = row.description_de
if hasattr(row, 'sort_order'):
d["sort_order"] = row.sort_order
if extra_fields:
for f in extra_fields:
if hasattr(row, f):
val = getattr(row, f)
if val is not None:
d[f] = val
return d
# ============================================================================
# Library Overview
# ============================================================================
@router.get("/libraries")
async def get_libraries_overview(db: Session = Depends(get_db)):
"""Overview of all library types with item counts."""
return {
"libraries": [
{"type": "data-subjects", "count": db.query(VVTLibDataSubjectDB).count()},
{"type": "data-categories", "count": db.query(VVTLibDataCategoryDB).count()},
{"type": "recipients", "count": db.query(VVTLibRecipientDB).count()},
{"type": "legal-bases", "count": db.query(VVTLibLegalBasisDB).count()},
{"type": "retention-rules", "count": db.query(VVTLibRetentionRuleDB).count()},
{"type": "transfer-mechanisms", "count": db.query(VVTLibTransferMechanismDB).count()},
{"type": "purposes", "count": db.query(VVTLibPurposeDB).count()},
{"type": "toms", "count": db.query(VVTLibTomDB).count()},
]
}
# ============================================================================
# Data Subjects
# ============================================================================
@router.get("/libraries/data-subjects")
async def list_data_subjects(
typical_for: Optional[str] = Query(None, description="Filter by business function"),
db: Session = Depends(get_db),
):
query = db.query(VVTLibDataSubjectDB).order_by(VVTLibDataSubjectDB.sort_order)
rows = query.all()
items = [_row_to_dict(r, ["art9_relevant", "typical_for"]) for r in rows]
if typical_for:
items = [i for i in items if typical_for in (i.get("typical_for") or [])]
return items
# ============================================================================
# Data Categories (hierarchical)
# ============================================================================
@router.get("/libraries/data-categories")
async def list_data_categories(
flat: Optional[bool] = Query(False, description="Return flat list instead of tree"),
parent_id: Optional[str] = Query(None),
is_art9: Optional[bool] = Query(None),
db: Session = Depends(get_db),
):
query = db.query(VVTLibDataCategoryDB).order_by(VVTLibDataCategoryDB.sort_order)
if parent_id is not None:
query = query.filter(VVTLibDataCategoryDB.parent_id == parent_id)
if is_art9 is not None:
query = query.filter(VVTLibDataCategoryDB.is_art9 == is_art9)
rows = query.all()
extra = ["parent_id", "is_art9", "is_art10", "risk_weight", "default_retention_rule", "default_legal_basis"]
items = [_row_to_dict(r, extra) for r in rows]
if flat or parent_id is not None or is_art9 is not None:
return items
# Build tree
by_parent: dict = {}
for item in items:
pid = item.get("parent_id")
by_parent.setdefault(pid, []).append(item)
tree = []
for item in by_parent.get(None, []):
children = by_parent.get(item["id"], [])
if children:
item["children"] = children
tree.append(item)
return tree
# ============================================================================
# Recipients
# ============================================================================
@router.get("/libraries/recipients")
async def list_recipients(
type: Optional[str] = Query(None, description="INTERNAL, PROCESSOR, CONTROLLER, AUTHORITY"),
db: Session = Depends(get_db),
):
query = db.query(VVTLibRecipientDB).order_by(VVTLibRecipientDB.sort_order)
if type:
query = query.filter(VVTLibRecipientDB.type == type)
rows = query.all()
return [_row_to_dict(r, ["type", "is_third_country", "country"]) for r in rows]
# ============================================================================
# Legal Bases
# ============================================================================
@router.get("/libraries/legal-bases")
async def list_legal_bases(
is_art9: Optional[bool] = Query(None),
type: Optional[str] = Query(None),
db: Session = Depends(get_db),
):
query = db.query(VVTLibLegalBasisDB).order_by(VVTLibLegalBasisDB.sort_order)
if is_art9 is not None:
query = query.filter(VVTLibLegalBasisDB.is_art9 == is_art9)
if type:
query = query.filter(VVTLibLegalBasisDB.type == type)
rows = query.all()
return [_row_to_dict(r, ["article", "type", "is_art9", "typical_national_law"]) for r in rows]
# ============================================================================
# Retention Rules
# ============================================================================
@router.get("/libraries/retention-rules")
async def list_retention_rules(db: Session = Depends(get_db)):
rows = db.query(VVTLibRetentionRuleDB).order_by(VVTLibRetentionRuleDB.sort_order).all()
return [_row_to_dict(r, ["legal_basis", "duration", "duration_unit", "start_event", "deletion_procedure"]) for r in rows]
# ============================================================================
# Transfer Mechanisms
# ============================================================================
@router.get("/libraries/transfer-mechanisms")
async def list_transfer_mechanisms(db: Session = Depends(get_db)):
rows = db.query(VVTLibTransferMechanismDB).order_by(VVTLibTransferMechanismDB.sort_order).all()
return [_row_to_dict(r, ["article", "requires_tia"]) for r in rows]
# ============================================================================
# Purposes
# ============================================================================
@router.get("/libraries/purposes")
async def list_purposes(
typical_for: Optional[str] = Query(None),
db: Session = Depends(get_db),
):
rows = db.query(VVTLibPurposeDB).order_by(VVTLibPurposeDB.sort_order).all()
items = [_row_to_dict(r, ["typical_legal_basis", "typical_for"]) for r in rows]
if typical_for:
items = [i for i in items if typical_for in (i.get("typical_for") or [])]
return items
# ============================================================================
# TOMs
# ============================================================================
@router.get("/libraries/toms")
async def list_toms(
category: Optional[str] = Query(None),
db: Session = Depends(get_db),
):
query = db.query(VVTLibTomDB).order_by(VVTLibTomDB.sort_order)
if category:
query = query.filter(VVTLibTomDB.category == category)
rows = query.all()
return [_row_to_dict(r, ["category", "art32_reference"]) for r in rows]
# ============================================================================
# Process Templates
# ============================================================================
def _template_to_dict(t: VVTProcessTemplateDB) -> dict:
return {
"id": t.id,
"name": t.name,
"description": t.description,
"business_function": t.business_function,
"purpose_refs": t.purpose_refs or [],
"legal_basis_refs": t.legal_basis_refs or [],
"data_subject_refs": t.data_subject_refs or [],
"data_category_refs": t.data_category_refs or [],
"recipient_refs": t.recipient_refs or [],
"tom_refs": t.tom_refs or [],
"transfer_mechanism_refs": t.transfer_mechanism_refs or [],
"retention_rule_ref": t.retention_rule_ref,
"typical_systems": t.typical_systems or [],
"protection_level": t.protection_level or "MEDIUM",
"dpia_required": t.dpia_required or False,
"risk_score": t.risk_score,
"tags": t.tags or [],
"is_system": t.is_system,
"sort_order": t.sort_order,
}
def _resolve_labels(template_dict: dict, db: Session) -> dict:
"""Resolve library IDs to labels within the template dict."""
resolvers = {
"purpose_refs": (VVTLibPurposeDB, "purpose_labels"),
"legal_basis_refs": (VVTLibLegalBasisDB, "legal_basis_labels"),
"data_subject_refs": (VVTLibDataSubjectDB, "data_subject_labels"),
"data_category_refs": (VVTLibDataCategoryDB, "data_category_labels"),
"recipient_refs": (VVTLibRecipientDB, "recipient_labels"),
"tom_refs": (VVTLibTomDB, "tom_labels"),
"transfer_mechanism_refs": (VVTLibTransferMechanismDB, "transfer_mechanism_labels"),
}
for refs_key, (model, labels_key) in resolvers.items():
ids = template_dict.get(refs_key) or []
if ids:
rows = db.query(model).filter(model.id.in_(ids)).all()
label_map = {r.id: r.label_de for r in rows}
template_dict[labels_key] = {rid: label_map.get(rid, rid) for rid in ids}
# Resolve single retention rule
rr = template_dict.get("retention_rule_ref")
if rr:
row = db.query(VVTLibRetentionRuleDB).filter(VVTLibRetentionRuleDB.id == rr).first()
if row:
template_dict["retention_rule_label"] = row.label_de
return template_dict
@router.get("/templates")
async def list_templates(
business_function: Optional[str] = Query(None),
search: Optional[str] = Query(None),
db: Session = Depends(get_db),
):
"""List process templates (system + tenant)."""
query = db.query(VVTProcessTemplateDB).order_by(VVTProcessTemplateDB.sort_order)
if business_function:
query = query.filter(VVTProcessTemplateDB.business_function == business_function)
if search:
term = f"%{search}%"
query = query.filter(
(VVTProcessTemplateDB.name.ilike(term)) |
(VVTProcessTemplateDB.description.ilike(term))
)
templates = query.all()
return [_template_to_dict(t) for t in templates]
@router.get("/templates/{template_id}")
async def get_template(
template_id: str,
db: Session = Depends(get_db),
):
"""Get a single template with resolved library labels."""
t = db.query(VVTProcessTemplateDB).filter(VVTProcessTemplateDB.id == template_id).first()
if not t:
raise HTTPException(status_code=404, detail=f"Template '{template_id}' not found")
result = _template_to_dict(t)
return _resolve_labels(result, db)
@router.post("/templates/{template_id}/instantiate", status_code=201)
async def instantiate_template(
template_id: str,
http_request: Request,
tid: str = Depends(get_tenant_id),
db: Session = Depends(get_db),
):
"""Create a new VVT activity from a process template."""
t = db.query(VVTProcessTemplateDB).filter(VVTProcessTemplateDB.id == template_id).first()
if not t:
raise HTTPException(status_code=404, detail=f"Template '{template_id}' not found")
# Generate unique VVT-ID
count = db.query(VVTActivityDB).filter(VVTActivityDB.tenant_id == tid).count()
vvt_id = f"VVT-{count + 1:04d}"
# Resolve library IDs to freetext labels for backward-compat fields
purpose_labels = _resolve_ids(db, VVTLibPurposeDB, t.purpose_refs or [])
legal_labels = _resolve_ids(db, VVTLibLegalBasisDB, t.legal_basis_refs or [])
subject_labels = _resolve_ids(db, VVTLibDataSubjectDB, t.data_subject_refs or [])
category_labels = _resolve_ids(db, VVTLibDataCategoryDB, t.data_category_refs or [])
recipient_labels = _resolve_ids(db, VVTLibRecipientDB, t.recipient_refs or [])
# Resolve retention rule
retention_period = {}
if t.retention_rule_ref:
rr = db.query(VVTLibRetentionRuleDB).filter(VVTLibRetentionRuleDB.id == t.retention_rule_ref).first()
if rr:
retention_period = {
"description": rr.label_de,
"legalBasis": rr.legal_basis or "",
"deletionProcedure": rr.deletion_procedure or "",
"duration": rr.duration,
"durationUnit": rr.duration_unit,
}
# Build structured TOMs from tom_refs
structured_toms = {"accessControl": [], "confidentiality": [], "integrity": [], "availability": [], "separation": []}
if t.tom_refs:
tom_rows = db.query(VVTLibTomDB).filter(VVTLibTomDB.id.in_(t.tom_refs)).all()
for tr in tom_rows:
cat = tr.category
if cat in structured_toms:
structured_toms[cat].append(tr.label_de)
act = VVTActivityDB(
tenant_id=tid,
vvt_id=vvt_id,
name=t.name,
description=t.description or "",
purposes=purpose_labels,
legal_bases=[{"type": lid, "description": lbl} for lid, lbl in zip(t.legal_basis_refs or [], legal_labels)],
data_subject_categories=subject_labels,
personal_data_categories=category_labels,
recipient_categories=[{"type": "unknown", "name": lbl} for lbl in recipient_labels],
retention_period=retention_period,
business_function=t.business_function,
systems=[{"systemId": s, "name": s} for s in (t.typical_systems or [])],
protection_level=t.protection_level or "MEDIUM",
dpia_required=t.dpia_required or False,
structured_toms=structured_toms,
status="DRAFT",
created_by=http_request.headers.get("X-User-ID", "system"),
# Library refs
purpose_refs=t.purpose_refs,
legal_basis_refs=t.legal_basis_refs,
data_subject_refs=t.data_subject_refs,
data_category_refs=t.data_category_refs,
recipient_refs=t.recipient_refs,
retention_rule_ref=t.retention_rule_ref,
transfer_mechanism_refs=t.transfer_mechanism_refs,
tom_refs=t.tom_refs,
source_template_id=t.id,
risk_score=t.risk_score,
)
db.add(act)
db.flush()
# Audit log
audit = VVTAuditLogDB(
tenant_id=tid,
action="CREATE",
entity_type="activity",
entity_id=act.id,
changed_by=http_request.headers.get("X-User-ID", "system"),
new_values={"vvt_id": vvt_id, "source_template_id": t.id, "name": t.name},
)
db.add(audit)
db.commit()
db.refresh(act)
# Return full response
from .vvt_routes import _activity_to_response
return _activity_to_response(act)
def _resolve_ids(db: Session, model, ids: list) -> list:
"""Resolve list of library IDs to list of label_de strings."""
if not ids:
return []
rows = db.query(model).filter(model.id.in_(ids)).all()
label_map = {r.id: r.label_de for r in rows}
return [label_map.get(i, i) for i in ids]

View File

@@ -81,6 +81,54 @@ async def upsert_organization(
# Activities
# ============================================================================
def _activity_to_response(act: VVTActivityDB) -> VVTActivityResponse:
return VVTActivityResponse(
id=str(act.id),
vvt_id=act.vvt_id,
name=act.name,
description=act.description,
purposes=act.purposes or [],
legal_bases=act.legal_bases or [],
data_subject_categories=act.data_subject_categories or [],
personal_data_categories=act.personal_data_categories or [],
recipient_categories=act.recipient_categories or [],
third_country_transfers=act.third_country_transfers or [],
retention_period=act.retention_period or {},
tom_description=act.tom_description,
business_function=act.business_function,
systems=act.systems or [],
deployment_model=act.deployment_model,
data_sources=act.data_sources or [],
data_flows=act.data_flows or [],
protection_level=act.protection_level or 'MEDIUM',
dpia_required=act.dpia_required or False,
structured_toms=act.structured_toms or {},
status=act.status or 'DRAFT',
responsible=act.responsible,
owner=act.owner,
last_reviewed_at=act.last_reviewed_at,
next_review_at=act.next_review_at,
created_by=act.created_by,
dsfa_id=str(act.dsfa_id) if act.dsfa_id else None,
# Library refs
purpose_refs=act.purpose_refs,
legal_basis_refs=act.legal_basis_refs,
data_subject_refs=act.data_subject_refs,
data_category_refs=act.data_category_refs,
recipient_refs=act.recipient_refs,
retention_rule_ref=act.retention_rule_ref,
transfer_mechanism_refs=act.transfer_mechanism_refs,
tom_refs=act.tom_refs,
source_template_id=act.source_template_id,
risk_score=act.risk_score,
linked_loeschfristen_ids=act.linked_loeschfristen_ids,
linked_tom_measure_ids=act.linked_tom_measure_ids,
art30_completeness=act.art30_completeness,
created_at=act.created_at,
updated_at=act.updated_at,
)
@router.get("/activities", response_model=List[VVTActivityResponse])
async def list_activities(
status: Optional[str] = Query(None),
@@ -145,6 +193,107 @@ async def delete_activity(
return service.delete_activity(tid, activity_id)
# ============================================================================
# Art. 30 Completeness Check
# ============================================================================
@router.get("/activities/{activity_id}/completeness")
async def get_activity_completeness(
activity_id: str,
tid: str = Depends(get_tenant_id),
db: Session = Depends(get_db),
):
"""Calculate Art. 30 completeness score for a VVT activity."""
act = db.query(VVTActivityDB).filter(
VVTActivityDB.id == activity_id,
VVTActivityDB.tenant_id == tid,
).first()
if not act:
raise HTTPException(status_code=404, detail=f"Activity {activity_id} not found")
return _calculate_completeness(act)
def _calculate_completeness(act: VVTActivityDB) -> dict:
"""Calculate Art. 30 completeness — required fields per DSGVO Art. 30 Abs. 1."""
missing = []
warnings = []
total_checks = 10
passed = 0
# 1. Name/Zweck
if act.name:
passed += 1
else:
missing.append("name")
# 2. Verarbeitungszwecke
has_purposes = bool(act.purposes) or bool(act.purpose_refs)
if has_purposes:
passed += 1
else:
missing.append("purposes")
# 3. Rechtsgrundlage
has_legal = bool(act.legal_bases) or bool(act.legal_basis_refs)
if has_legal:
passed += 1
else:
missing.append("legal_bases")
# 4. Betroffenenkategorien
has_subjects = bool(act.data_subject_categories) or bool(act.data_subject_refs)
if has_subjects:
passed += 1
else:
missing.append("data_subjects")
# 5. Datenkategorien
has_categories = bool(act.personal_data_categories) or bool(act.data_category_refs)
if has_categories:
passed += 1
else:
missing.append("data_categories")
# 6. Empfaenger
has_recipients = bool(act.recipient_categories) or bool(act.recipient_refs)
if has_recipients:
passed += 1
else:
missing.append("recipients")
# 7. Drittland-Uebermittlung (checked but not strictly required)
passed += 1 # always passes — no transfer is valid state
# 8. Loeschfristen
has_retention = bool(act.retention_period and act.retention_period.get('description')) or bool(act.retention_rule_ref)
if has_retention:
passed += 1
else:
missing.append("retention_period")
# 9. TOM-Beschreibung
has_tom = bool(act.tom_description) or bool(act.tom_refs) or bool(act.structured_toms)
if has_tom:
passed += 1
else:
missing.append("tom_description")
# 10. Verantwortlicher
if act.responsible:
passed += 1
else:
missing.append("responsible")
# Warnings
if act.dpia_required and not act.dsfa_id:
warnings.append("dpia_required_but_no_dsfa_linked")
if act.third_country_transfers and not act.transfer_mechanism_refs:
warnings.append("third_country_transfer_without_mechanism")
score = int((passed / total_checks) * 100)
return {"score": score, "missing": missing, "warnings": warnings, "passed": passed, "total": total_checks}
# ============================================================================
# Audit Log
# ============================================================================

View File

@@ -0,0 +1,443 @@
{
"framework_id": "CSA_CCM",
"display_name": "Cloud Security Alliance CCM v4",
"license": {
"type": "restricted",
"rag_allowed": false,
"use_as_metadata": true,
"note": "Abstrahierte Struktur — keine Originaltexte uebernommen"
},
"domains": [
{
"domain_id": "AIS",
"title": "Application and Interface Security",
"aliases": ["ais", "application and interface security", "anwendungssicherheit", "schnittstellensicherheit"],
"keywords": ["application", "anwendung", "interface", "schnittstelle", "api", "web", "eingabevalidierung"],
"subcontrols": [
{
"subcontrol_id": "AIS-01",
"title": "Application Security Policy",
"statement": "Sicherheitsrichtlinien fuer Anwendungsentwicklung und Schnittstellenmanagement muessen definiert und angewendet werden.",
"keywords": ["policy", "richtlinie", "entwicklung"],
"action_hint": "document",
"object_hint": "Anwendungssicherheitsrichtlinie",
"object_class": "policy"
},
{
"subcontrol_id": "AIS-02",
"title": "Application Security Design",
"statement": "Sicherheitsanforderungen muessen in den Entwurf jeder Anwendung integriert werden.",
"keywords": ["design", "entwurf", "security by design"],
"action_hint": "implement",
"object_hint": "Sicherheitsanforderungen im Anwendungsentwurf",
"object_class": "process"
},
{
"subcontrol_id": "AIS-03",
"title": "Application Security Testing",
"statement": "Anwendungen muessen vor dem Deployment und regelmaessig auf Sicherheitsschwachstellen getestet werden.",
"keywords": ["testing", "test", "sast", "dast", "penetration"],
"action_hint": "test",
"object_hint": "Anwendungssicherheitstests",
"object_class": "process"
},
{
"subcontrol_id": "AIS-04",
"title": "Secure Development Practices",
"statement": "Sichere Entwicklungspraktiken (Code Review, Pair Programming, SAST) muessen fuer alle Entwicklungsprojekte gelten.",
"keywords": ["development", "entwicklung", "code review", "sast", "praktiken"],
"action_hint": "implement",
"object_hint": "Sichere Entwicklungspraktiken",
"object_class": "process"
},
{
"subcontrol_id": "AIS-05",
"title": "API Security",
"statement": "APIs muessen authentifiziert, autorisiert und gegen Missbrauch geschuetzt werden.",
"keywords": ["api", "schnittstelle", "authentifizierung", "rate limiting"],
"action_hint": "implement",
"object_hint": "API-Sicherheitskontrollen",
"object_class": "interface"
},
{
"subcontrol_id": "AIS-06",
"title": "Automated Application Security Testing",
"statement": "Automatisierte Sicherheitstests muessen in die CI/CD-Pipeline integriert werden.",
"keywords": ["automatisiert", "ci/cd", "pipeline", "sast", "dast"],
"action_hint": "configure",
"object_hint": "Automatisierte Sicherheitstests in CI/CD",
"object_class": "configuration"
}
]
},
{
"domain_id": "BCR",
"title": "Business Continuity and Resilience",
"aliases": ["bcr", "business continuity", "resilience", "geschaeftskontinuitaet", "resilienz"],
"keywords": ["continuity", "kontinuitaet", "resilience", "resilienz", "disaster", "recovery", "backup"],
"subcontrols": [
{
"subcontrol_id": "BCR-01",
"title": "Business Continuity Planning",
"statement": "Ein Geschaeftskontinuitaetsplan muss erstellt, dokumentiert und regelmaessig getestet werden.",
"keywords": ["plan", "kontinuitaet", "geschaeft"],
"action_hint": "document",
"object_hint": "Geschaeftskontinuitaetsplan",
"object_class": "policy"
},
{
"subcontrol_id": "BCR-02",
"title": "Risk Assessment for BCM",
"statement": "Risikobewertungen muessen fuer geschaeftskritische Prozesse durchgefuehrt werden.",
"keywords": ["risiko", "bewertung", "kritisch"],
"action_hint": "assess",
"object_hint": "BCM-Risikobewertung",
"object_class": "risk_artifact"
},
{
"subcontrol_id": "BCR-03",
"title": "Backup and Recovery",
"statement": "Datensicherungen muessen regelmaessig erstellt und Wiederherstellungstests durchgefuehrt werden.",
"keywords": ["backup", "sicherung", "wiederherstellung", "recovery"],
"action_hint": "maintain",
"object_hint": "Datensicherung und Wiederherstellung",
"object_class": "technical_control"
},
{
"subcontrol_id": "BCR-04",
"title": "Disaster Recovery Planning",
"statement": "Ein Disaster-Recovery-Plan muss dokumentiert und jaehrlich getestet werden.",
"keywords": ["disaster", "recovery", "katastrophe"],
"action_hint": "document",
"object_hint": "Disaster-Recovery-Plan",
"object_class": "policy"
}
]
},
{
"domain_id": "CCC",
"title": "Change Control and Configuration Management",
"aliases": ["ccc", "change control", "configuration management", "aenderungsmanagement", "konfigurationsmanagement"],
"keywords": ["change", "aenderung", "konfiguration", "configuration", "release", "deployment"],
"subcontrols": [
{
"subcontrol_id": "CCC-01",
"title": "Change Management Policy",
"statement": "Ein Aenderungsmanagement-Prozess muss definiert und fuer alle Aenderungen angewendet werden.",
"keywords": ["policy", "richtlinie", "aenderung"],
"action_hint": "document",
"object_hint": "Aenderungsmanagement-Richtlinie",
"object_class": "policy"
},
{
"subcontrol_id": "CCC-02",
"title": "Change Testing",
"statement": "Aenderungen muessen vor der Produktivsetzung getestet und genehmigt werden.",
"keywords": ["test", "genehmigung", "approval"],
"action_hint": "test",
"object_hint": "Aenderungstests",
"object_class": "process"
},
{
"subcontrol_id": "CCC-03",
"title": "Configuration Baseline",
"statement": "Basiskonfigurationen fuer alle Systeme muessen definiert und dokumentiert werden.",
"keywords": ["baseline", "basis", "standard"],
"action_hint": "define",
"object_hint": "Konfigurationsbaseline",
"object_class": "configuration"
}
]
},
{
"domain_id": "CEK",
"title": "Cryptography, Encryption and Key Management",
"aliases": ["cek", "cryptography", "encryption", "key management", "kryptographie", "verschluesselung", "schluesselverwaltung"],
"keywords": ["kryptographie", "verschluesselung", "schluessel", "key", "encryption", "certificate", "zertifikat"],
"subcontrols": [
{
"subcontrol_id": "CEK-01",
"title": "Encryption Policy",
"statement": "Verschluesselungsrichtlinien muessen definiert werden, die Algorithmen, Schluessellaengen und Einsatzbereiche festlegen.",
"keywords": ["policy", "richtlinie", "algorithmus"],
"action_hint": "document",
"object_hint": "Verschluesselungsrichtlinie",
"object_class": "policy"
},
{
"subcontrol_id": "CEK-02",
"title": "Key Management",
"statement": "Kryptographische Schluessel muessen ueber ihren Lebenszyklus sicher verwaltet werden.",
"keywords": ["key", "schluessel", "management", "lebenszyklus"],
"action_hint": "maintain",
"object_hint": "Schluesselverwaltung",
"object_class": "cryptographic_control"
},
{
"subcontrol_id": "CEK-03",
"title": "Data Encryption",
"statement": "Sensible Daten muessen bei Speicherung und Uebertragung verschluesselt werden.",
"keywords": ["data", "daten", "speicherung", "uebertragung"],
"action_hint": "encrypt",
"object_hint": "Datenverschluesselung",
"object_class": "cryptographic_control"
}
]
},
{
"domain_id": "DSP",
"title": "Data Security and Privacy",
"aliases": ["dsp", "data security", "privacy", "datensicherheit", "datenschutz"],
"keywords": ["datenschutz", "datensicherheit", "privacy", "data security", "pii", "personenbezogen", "dsgvo"],
"subcontrols": [
{
"subcontrol_id": "DSP-01",
"title": "Data Classification",
"statement": "Daten muessen nach Sensibilitaet klassifiziert und entsprechend geschuetzt werden.",
"keywords": ["klassifizierung", "sensibilitaet", "classification"],
"action_hint": "define",
"object_hint": "Datenklassifizierung",
"object_class": "data"
},
{
"subcontrol_id": "DSP-02",
"title": "Data Inventory",
"statement": "Ein Dateninventar muss gefuehrt werden, das alle Verarbeitungen personenbezogener Daten dokumentiert.",
"keywords": ["inventar", "verzeichnis", "verarbeitung", "vvt"],
"action_hint": "maintain",
"object_hint": "Dateninventar",
"object_class": "register"
},
{
"subcontrol_id": "DSP-03",
"title": "Data Retention and Deletion",
"statement": "Aufbewahrungsfristen muessen definiert und Daten nach Ablauf sicher geloescht werden.",
"keywords": ["retention", "aufbewahrung", "loeschung", "frist"],
"action_hint": "delete",
"object_hint": "Datenloeschung nach Frist",
"object_class": "data"
},
{
"subcontrol_id": "DSP-04",
"title": "Privacy Impact Assessment",
"statement": "Datenschutz-Folgenabschaetzungen muessen fuer risikoreiche Verarbeitungen durchgefuehrt werden.",
"keywords": ["dsfa", "pia", "folgenabschaetzung", "impact"],
"action_hint": "assess",
"object_hint": "Datenschutz-Folgenabschaetzung",
"object_class": "risk_artifact"
},
{
"subcontrol_id": "DSP-05",
"title": "Data Subject Rights",
"statement": "Verfahren zur Bearbeitung von Betroffenenrechten muessen implementiert werden.",
"keywords": ["betroffenenrechte", "auskunft", "loeschung", "data subject"],
"action_hint": "implement",
"object_hint": "Betroffenenrechte-Verfahren",
"object_class": "process"
}
]
},
{
"domain_id": "GRC",
"title": "Governance, Risk and Compliance",
"aliases": ["grc", "governance", "risk", "compliance", "risikomanagement"],
"keywords": ["governance", "risiko", "compliance", "management", "policy", "richtlinie"],
"subcontrols": [
{
"subcontrol_id": "GRC-01",
"title": "Information Security Program",
"statement": "Ein umfassendes Informationssicherheitsprogramm muss etabliert und aufrechterhalten werden.",
"keywords": ["programm", "sicherheit", "information"],
"action_hint": "maintain",
"object_hint": "Informationssicherheitsprogramm",
"object_class": "policy"
},
{
"subcontrol_id": "GRC-02",
"title": "Risk Management Program",
"statement": "Ein Risikomanagement-Programm muss implementiert werden, das Identifikation, Bewertung und Behandlung umfasst.",
"keywords": ["risiko", "management", "bewertung", "behandlung"],
"action_hint": "implement",
"object_hint": "Risikomanagement-Programm",
"object_class": "process"
},
{
"subcontrol_id": "GRC-03",
"title": "Compliance Monitoring",
"statement": "Die Einhaltung regulatorischer und vertraglicher Anforderungen muss ueberwacht werden.",
"keywords": ["compliance", "einhaltung", "regulatorisch", "ueberwachung"],
"action_hint": "monitor",
"object_hint": "Compliance-Ueberwachung",
"object_class": "process"
}
]
},
{
"domain_id": "IAM",
"title": "Identity and Access Management",
"aliases": ["iam", "identity", "access management", "identitaetsmanagement", "zugriffsverwaltung"],
"keywords": ["identitaet", "zugriff", "identity", "access", "authentifizierung", "autorisierung", "sso"],
"subcontrols": [
{
"subcontrol_id": "IAM-01",
"title": "Identity and Access Policy",
"statement": "Identitaets- und Zugriffsmanagement-Richtlinien muessen definiert werden.",
"keywords": ["policy", "richtlinie"],
"action_hint": "document",
"object_hint": "IAM-Richtlinie",
"object_class": "policy"
},
{
"subcontrol_id": "IAM-02",
"title": "Strong Authentication",
"statement": "Starke Authentifizierung (MFA) muss fuer administrative und sicherheitskritische Zugriffe gefordert werden.",
"keywords": ["mfa", "stark", "authentifizierung", "admin"],
"action_hint": "implement",
"object_hint": "Starke Authentifizierung",
"object_class": "technical_control"
},
{
"subcontrol_id": "IAM-03",
"title": "Identity Lifecycle Management",
"statement": "Identitaeten muessen ueber ihren gesamten Lebenszyklus verwaltet werden.",
"keywords": ["lifecycle", "lebenszyklus", "onboarding", "offboarding"],
"action_hint": "maintain",
"object_hint": "Identitaets-Lebenszyklus",
"object_class": "account"
},
{
"subcontrol_id": "IAM-04",
"title": "Access Review",
"statement": "Zugriffsrechte muessen regelmaessig ueberprueft und ueberschuessige Rechte entzogen werden.",
"keywords": ["review", "ueberpruefen", "rechte", "rezertifizierung"],
"action_hint": "review",
"object_hint": "Zugriffsrechte-Review",
"object_class": "access_control"
}
]
},
{
"domain_id": "LOG",
"title": "Logging and Monitoring",
"aliases": ["log", "logging", "monitoring", "protokollierung", "ueberwachung"],
"keywords": ["logging", "monitoring", "protokollierung", "ueberwachung", "siem", "alarm"],
"subcontrols": [
{
"subcontrol_id": "LOG-01",
"title": "Logging Policy",
"statement": "Protokollierungs-Richtlinien muessen definiert werden, die Umfang und Aufbewahrung festlegen.",
"keywords": ["policy", "richtlinie", "umfang", "aufbewahrung"],
"action_hint": "document",
"object_hint": "Protokollierungsrichtlinie",
"object_class": "policy"
},
{
"subcontrol_id": "LOG-02",
"title": "Security Event Logging",
"statement": "Sicherheitsrelevante Ereignisse muessen erfasst und zentral gespeichert werden.",
"keywords": ["event", "ereignis", "sicherheit", "zentral"],
"action_hint": "configure",
"object_hint": "Sicherheits-Event-Logging",
"object_class": "configuration"
},
{
"subcontrol_id": "LOG-03",
"title": "Monitoring and Alerting",
"statement": "Sicherheitsrelevante Logs muessen ueberwacht und bei Anomalien Alarme ausgeloest werden.",
"keywords": ["monitoring", "alerting", "alarm", "anomalie"],
"action_hint": "monitor",
"object_hint": "Log-Ueberwachung und Alarmierung",
"object_class": "technical_control"
}
]
},
{
"domain_id": "SEF",
"title": "Security Incident Management",
"aliases": ["sef", "security incident", "incident management", "vorfallmanagement", "sicherheitsvorfall"],
"keywords": ["vorfall", "incident", "sicherheitsvorfall", "reaktion", "response", "meldung"],
"subcontrols": [
{
"subcontrol_id": "SEF-01",
"title": "Incident Management Policy",
"statement": "Ein Vorfallmanagement-Prozess muss definiert, dokumentiert und getestet werden.",
"keywords": ["policy", "richtlinie", "prozess"],
"action_hint": "document",
"object_hint": "Vorfallmanagement-Richtlinie",
"object_class": "policy"
},
{
"subcontrol_id": "SEF-02",
"title": "Incident Response Team",
"statement": "Ein Incident-Response-Team muss benannt und geschult werden.",
"keywords": ["team", "response", "schulung"],
"action_hint": "define",
"object_hint": "Incident-Response-Team",
"object_class": "role"
},
{
"subcontrol_id": "SEF-03",
"title": "Incident Reporting",
"statement": "Sicherheitsvorfaelle muessen innerhalb definierter Fristen an zustaendige Stellen gemeldet werden.",
"keywords": ["reporting", "meldung", "frist", "behoerde"],
"action_hint": "report",
"object_hint": "Vorfallmeldung",
"object_class": "incident"
},
{
"subcontrol_id": "SEF-04",
"title": "Incident Lessons Learned",
"statement": "Nach jedem Vorfall muss eine Nachbereitung mit Lessons Learned durchgefuehrt werden.",
"keywords": ["lessons learned", "nachbereitung", "verbesserung"],
"action_hint": "review",
"object_hint": "Vorfall-Nachbereitung",
"object_class": "record"
}
]
},
{
"domain_id": "TVM",
"title": "Threat and Vulnerability Management",
"aliases": ["tvm", "threat", "vulnerability", "schwachstelle", "bedrohung", "schwachstellenmanagement"],
"keywords": ["schwachstelle", "vulnerability", "threat", "bedrohung", "patch", "scan"],
"subcontrols": [
{
"subcontrol_id": "TVM-01",
"title": "Vulnerability Management Policy",
"statement": "Schwachstellenmanagement-Richtlinien muessen definiert und umgesetzt werden.",
"keywords": ["policy", "richtlinie"],
"action_hint": "document",
"object_hint": "Schwachstellenmanagement-Richtlinie",
"object_class": "policy"
},
{
"subcontrol_id": "TVM-02",
"title": "Vulnerability Scanning",
"statement": "Systeme muessen regelmaessig auf Schwachstellen gescannt werden.",
"keywords": ["scan", "scanning", "regelmaessig"],
"action_hint": "test",
"object_hint": "Schwachstellenscan",
"object_class": "system"
},
{
"subcontrol_id": "TVM-03",
"title": "Vulnerability Remediation",
"statement": "Erkannte Schwachstellen muessen priorisiert und innerhalb definierter Fristen behoben werden.",
"keywords": ["remediation", "behebung", "frist", "priorisierung"],
"action_hint": "remediate",
"object_hint": "Schwachstellenbehebung",
"object_class": "system"
},
{
"subcontrol_id": "TVM-04",
"title": "Penetration Testing",
"statement": "Regelmaessige Penetrationstests muessen durchgefuehrt werden.",
"keywords": ["penetration", "pentest", "test"],
"action_hint": "test",
"object_hint": "Penetrationstest",
"object_class": "system"
}
]
}
]
}

View File

@@ -0,0 +1,514 @@
{
"framework_id": "NIST_SP800_53",
"display_name": "NIST SP 800-53 Rev. 5",
"license": {
"type": "public_domain",
"rag_allowed": true,
"use_as_metadata": true
},
"domains": [
{
"domain_id": "AC",
"title": "Access Control",
"aliases": ["access control", "zugriffskontrolle", "zugriffssteuerung"],
"keywords": ["access", "zugriff", "berechtigung", "authorization", "autorisierung"],
"subcontrols": [
{
"subcontrol_id": "AC-1",
"title": "Access Control Policy and Procedures",
"statement": "Zugriffskontrollrichtlinien und -verfahren muessen definiert, dokumentiert und regelmaessig ueberprueft werden.",
"keywords": ["policy", "richtlinie", "verfahren", "procedures"],
"action_hint": "document",
"object_hint": "Zugriffskontrollrichtlinie",
"object_class": "policy"
},
{
"subcontrol_id": "AC-2",
"title": "Account Management",
"statement": "Benutzerkonten muessen ueber ihren gesamten Lebenszyklus verwaltet werden: Erstellung, Aktivierung, Aenderung, Deaktivierung und Loeschung.",
"keywords": ["account", "konto", "benutzer", "lifecycle", "lebenszyklus"],
"action_hint": "maintain",
"object_hint": "Benutzerkontenverwaltung",
"object_class": "account"
},
{
"subcontrol_id": "AC-3",
"title": "Access Enforcement",
"statement": "Der Zugriff auf Systemressourcen muss gemaess der definierten Zugriffskontrollrichtlinie durchgesetzt werden.",
"keywords": ["enforcement", "durchsetzung", "ressourcen", "system"],
"action_hint": "restrict_access",
"object_hint": "Zugriffsdurchsetzung",
"object_class": "access_control"
},
{
"subcontrol_id": "AC-5",
"title": "Separation of Duties",
"statement": "Aufgabentrennung muss definiert und durchgesetzt werden, um Interessenkonflikte und Missbrauch zu verhindern.",
"keywords": ["separation", "trennung", "duties", "aufgaben", "funktionstrennung"],
"action_hint": "define",
"object_hint": "Aufgabentrennung",
"object_class": "role"
},
{
"subcontrol_id": "AC-6",
"title": "Least Privilege",
"statement": "Zugriffsrechte muessen nach dem Prinzip der minimalen Rechte vergeben werden.",
"keywords": ["least privilege", "minimal", "rechte", "privileg"],
"action_hint": "restrict_access",
"object_hint": "Minimale Rechtevergabe",
"object_class": "access_control"
},
{
"subcontrol_id": "AC-7",
"title": "Unsuccessful Logon Attempts",
"statement": "Fehlgeschlagene Anmeldeversuche muessen begrenzt und ueberwacht werden.",
"keywords": ["logon", "anmeldung", "fehlgeschlagen", "sperre", "lockout"],
"action_hint": "monitor",
"object_hint": "Anmeldeversuchsueberwachung",
"object_class": "technical_control"
},
{
"subcontrol_id": "AC-17",
"title": "Remote Access",
"statement": "Fernzugriff muss autorisiert, ueberwacht und verschluesselt werden.",
"keywords": ["remote", "fern", "vpn", "fernzugriff"],
"action_hint": "configure",
"object_hint": "Fernzugriffskonfiguration",
"object_class": "technical_control"
}
]
},
{
"domain_id": "AU",
"title": "Audit and Accountability",
"aliases": ["audit", "protokollierung", "accountability", "rechenschaftspflicht"],
"keywords": ["audit", "log", "protokoll", "nachvollziehbarkeit", "logging"],
"subcontrols": [
{
"subcontrol_id": "AU-1",
"title": "Audit Policy and Procedures",
"statement": "Audit- und Protokollierungsrichtlinien muessen definiert und regelmaessig ueberprueft werden.",
"keywords": ["policy", "richtlinie", "audit"],
"action_hint": "document",
"object_hint": "Auditrichtlinie",
"object_class": "policy"
},
{
"subcontrol_id": "AU-2",
"title": "Event Logging",
"statement": "Sicherheitsrelevante Ereignisse muessen identifiziert und protokolliert werden.",
"keywords": ["event", "ereignis", "logging", "protokollierung"],
"action_hint": "configure",
"object_hint": "Ereignisprotokollierung",
"object_class": "configuration"
},
{
"subcontrol_id": "AU-3",
"title": "Content of Audit Records",
"statement": "Audit-Eintraege muessen ausreichende Informationen enthalten: Zeitstempel, Quelle, Ergebnis, Identitaet.",
"keywords": ["content", "inhalt", "record", "eintrag"],
"action_hint": "define",
"object_hint": "Audit-Eintragsformat",
"object_class": "record"
},
{
"subcontrol_id": "AU-6",
"title": "Audit Record Review and Reporting",
"statement": "Audit-Eintraege muessen regelmaessig ueberprueft und bei Anomalien berichtet werden.",
"keywords": ["review", "ueberpruefen", "reporting", "anomalie"],
"action_hint": "review",
"object_hint": "Audit-Ueberpruefung",
"object_class": "record"
},
{
"subcontrol_id": "AU-9",
"title": "Protection of Audit Information",
"statement": "Audit-Daten muessen vor unbefugtem Zugriff, Aenderung und Loeschung geschuetzt werden.",
"keywords": ["schutz", "protection", "integritaet", "integrity"],
"action_hint": "implement",
"object_hint": "Audit-Datenschutz",
"object_class": "technical_control"
}
]
},
{
"domain_id": "AT",
"title": "Awareness and Training",
"aliases": ["awareness", "training", "schulung", "sensibilisierung"],
"keywords": ["training", "schulung", "awareness", "sensibilisierung", "weiterbildung"],
"subcontrols": [
{
"subcontrol_id": "AT-1",
"title": "Policy and Procedures",
"statement": "Schulungs- und Sensibilisierungsrichtlinien muessen definiert und regelmaessig aktualisiert werden.",
"keywords": ["policy", "richtlinie"],
"action_hint": "document",
"object_hint": "Schulungsrichtlinie",
"object_class": "policy"
},
{
"subcontrol_id": "AT-2",
"title": "Literacy Training and Awareness",
"statement": "Alle Mitarbeiter muessen regelmaessig Sicherheitsschulungen erhalten.",
"keywords": ["mitarbeiter", "schulung", "sicherheit"],
"action_hint": "train",
"object_hint": "Sicherheitsschulung",
"object_class": "training"
},
{
"subcontrol_id": "AT-3",
"title": "Role-Based Training",
"statement": "Rollenbasierte Sicherheitsschulungen muessen fuer Mitarbeiter mit besonderen Sicherheitsaufgaben durchgefuehrt werden.",
"keywords": ["rollenbasiert", "role-based", "speziell"],
"action_hint": "train",
"object_hint": "Rollenbasierte Sicherheitsschulung",
"object_class": "training"
}
]
},
{
"domain_id": "CM",
"title": "Configuration Management",
"aliases": ["configuration management", "konfigurationsmanagement", "konfiguration"],
"keywords": ["konfiguration", "configuration", "baseline", "haertung", "hardening"],
"subcontrols": [
{
"subcontrol_id": "CM-1",
"title": "Policy and Procedures",
"statement": "Konfigurationsmanagement-Richtlinien muessen dokumentiert und gepflegt werden.",
"keywords": ["policy", "richtlinie"],
"action_hint": "document",
"object_hint": "Konfigurationsmanagement-Richtlinie",
"object_class": "policy"
},
{
"subcontrol_id": "CM-2",
"title": "Baseline Configuration",
"statement": "Basiskonfigurationen fuer Systeme muessen definiert, dokumentiert und gepflegt werden.",
"keywords": ["baseline", "basis", "standard"],
"action_hint": "define",
"object_hint": "Basiskonfiguration",
"object_class": "configuration"
},
{
"subcontrol_id": "CM-6",
"title": "Configuration Settings",
"statement": "Sicherheitsrelevante Konfigurationseinstellungen muessen definiert und durchgesetzt werden.",
"keywords": ["settings", "einstellungen", "sicherheit"],
"action_hint": "configure",
"object_hint": "Sicherheitskonfiguration",
"object_class": "configuration"
},
{
"subcontrol_id": "CM-7",
"title": "Least Functionality",
"statement": "Systeme muessen so konfiguriert werden, dass nur notwendige Funktionen aktiv sind.",
"keywords": ["least functionality", "minimal", "dienste", "ports"],
"action_hint": "configure",
"object_hint": "Minimalkonfiguration",
"object_class": "configuration"
},
{
"subcontrol_id": "CM-8",
"title": "System Component Inventory",
"statement": "Ein Inventar aller Systemkomponenten muss gefuehrt und aktuell gehalten werden.",
"keywords": ["inventar", "inventory", "komponenten", "assets"],
"action_hint": "maintain",
"object_hint": "Systemkomponenten-Inventar",
"object_class": "register"
}
]
},
{
"domain_id": "IA",
"title": "Identification and Authentication",
"aliases": ["identification", "authentication", "identifikation", "authentifizierung"],
"keywords": ["authentifizierung", "identifikation", "identity", "passwort", "mfa", "credential"],
"subcontrols": [
{
"subcontrol_id": "IA-1",
"title": "Policy and Procedures",
"statement": "Identifikations- und Authentifizierungsrichtlinien muessen dokumentiert und regelmaessig ueberprueft werden.",
"keywords": ["policy", "richtlinie"],
"action_hint": "document",
"object_hint": "Authentifizierungsrichtlinie",
"object_class": "policy"
},
{
"subcontrol_id": "IA-2",
"title": "Identification and Authentication",
"statement": "Benutzer und Geraete muessen eindeutig identifiziert und authentifiziert werden.",
"keywords": ["benutzer", "geraete", "identifizierung"],
"action_hint": "implement",
"object_hint": "Benutzerauthentifizierung",
"object_class": "technical_control"
},
{
"subcontrol_id": "IA-2(1)",
"title": "Multi-Factor Authentication",
"statement": "Multi-Faktor-Authentifizierung muss fuer privilegierte Konten implementiert werden.",
"keywords": ["mfa", "multi-faktor", "zwei-faktor", "2fa"],
"action_hint": "implement",
"object_hint": "Multi-Faktor-Authentifizierung",
"object_class": "technical_control"
},
{
"subcontrol_id": "IA-5",
"title": "Authenticator Management",
"statement": "Authentifizierungsmittel (Passwoerter, Token, Zertifikate) muessen sicher verwaltet werden.",
"keywords": ["passwort", "token", "zertifikat", "credential"],
"action_hint": "maintain",
"object_hint": "Authentifizierungsmittel-Verwaltung",
"object_class": "technical_control"
}
]
},
{
"domain_id": "IR",
"title": "Incident Response",
"aliases": ["incident response", "vorfallbehandlung", "vorfallreaktion", "incident management"],
"keywords": ["vorfall", "incident", "reaktion", "response", "breach", "sicherheitsvorfall"],
"subcontrols": [
{
"subcontrol_id": "IR-1",
"title": "Policy and Procedures",
"statement": "Vorfallreaktionsrichtlinien und -verfahren muessen definiert und regelmaessig aktualisiert werden.",
"keywords": ["policy", "richtlinie", "verfahren"],
"action_hint": "document",
"object_hint": "Vorfallreaktionsrichtlinie",
"object_class": "policy"
},
{
"subcontrol_id": "IR-2",
"title": "Incident Response Training",
"statement": "Mitarbeiter muessen regelmaessig in der Vorfallreaktion geschult werden.",
"keywords": ["training", "schulung"],
"action_hint": "train",
"object_hint": "Vorfallreaktionsschulung",
"object_class": "training"
},
{
"subcontrol_id": "IR-4",
"title": "Incident Handling",
"statement": "Ein strukturierter Prozess fuer die Vorfallbehandlung muss implementiert werden: Erkennung, Analyse, Eindaemmung, Behebung.",
"keywords": ["handling", "behandlung", "erkennung", "eindaemmung"],
"action_hint": "implement",
"object_hint": "Vorfallbehandlungsprozess",
"object_class": "process"
},
{
"subcontrol_id": "IR-5",
"title": "Incident Monitoring",
"statement": "Sicherheitsvorfaelle muessen kontinuierlich ueberwacht und verfolgt werden.",
"keywords": ["monitoring", "ueberwachung", "tracking"],
"action_hint": "monitor",
"object_hint": "Vorfallsueberwachung",
"object_class": "incident"
},
{
"subcontrol_id": "IR-6",
"title": "Incident Reporting",
"statement": "Sicherheitsvorfaelle muessen innerhalb definierter Fristen an die zustaendigen Stellen gemeldet werden.",
"keywords": ["reporting", "meldung", "melden", "frist"],
"action_hint": "report",
"object_hint": "Vorfallmeldung",
"object_class": "incident"
},
{
"subcontrol_id": "IR-8",
"title": "Incident Response Plan",
"statement": "Ein Vorfallreaktionsplan muss dokumentiert und regelmaessig getestet werden.",
"keywords": ["plan", "dokumentation", "test"],
"action_hint": "document",
"object_hint": "Vorfallreaktionsplan",
"object_class": "policy"
}
]
},
{
"domain_id": "RA",
"title": "Risk Assessment",
"aliases": ["risk assessment", "risikobewertung", "risikoanalyse"],
"keywords": ["risiko", "risk", "bewertung", "assessment", "analyse", "bedrohung", "threat"],
"subcontrols": [
{
"subcontrol_id": "RA-1",
"title": "Policy and Procedures",
"statement": "Risikobewertungsrichtlinien muessen dokumentiert und regelmaessig aktualisiert werden.",
"keywords": ["policy", "richtlinie"],
"action_hint": "document",
"object_hint": "Risikobewertungsrichtlinie",
"object_class": "policy"
},
{
"subcontrol_id": "RA-3",
"title": "Risk Assessment",
"statement": "Regelmaessige Risikobewertungen muessen durchgefuehrt und dokumentiert werden.",
"keywords": ["bewertung", "assessment", "regelmaessig"],
"action_hint": "assess",
"object_hint": "Risikobewertung",
"object_class": "risk_artifact"
},
{
"subcontrol_id": "RA-5",
"title": "Vulnerability Monitoring and Scanning",
"statement": "Systeme muessen regelmaessig auf Schwachstellen gescannt und ueberwacht werden.",
"keywords": ["vulnerability", "schwachstelle", "scan", "monitoring"],
"action_hint": "monitor",
"object_hint": "Schwachstellenueberwachung",
"object_class": "system"
}
]
},
{
"domain_id": "SC",
"title": "System and Communications Protection",
"aliases": ["system protection", "communications protection", "kommunikationsschutz", "systemschutz"],
"keywords": ["verschluesselung", "encryption", "tls", "netzwerk", "network", "kommunikation", "firewall"],
"subcontrols": [
{
"subcontrol_id": "SC-1",
"title": "Policy and Procedures",
"statement": "System- und Kommunikationsschutzrichtlinien muessen dokumentiert und aktuell gehalten werden.",
"keywords": ["policy", "richtlinie"],
"action_hint": "document",
"object_hint": "Kommunikationsschutzrichtlinie",
"object_class": "policy"
},
{
"subcontrol_id": "SC-7",
"title": "Boundary Protection",
"statement": "Netzwerkgrenzen muessen durch Firewall-Regeln und Zugangskontrollen geschuetzt werden.",
"keywords": ["boundary", "grenze", "firewall", "netzwerk"],
"action_hint": "implement",
"object_hint": "Netzwerkgrenzschutz",
"object_class": "technical_control"
},
{
"subcontrol_id": "SC-8",
"title": "Transmission Confidentiality and Integrity",
"statement": "Daten muessen bei der Uebertragung durch Verschluesselung geschuetzt werden.",
"keywords": ["transmission", "uebertragung", "verschluesselung", "tls"],
"action_hint": "encrypt",
"object_hint": "Uebertragungsverschluesselung",
"object_class": "cryptographic_control"
},
{
"subcontrol_id": "SC-12",
"title": "Cryptographic Key Establishment and Management",
"statement": "Kryptographische Schluessel muessen sicher erzeugt, verteilt, gespeichert und widerrufen werden.",
"keywords": ["key", "schluessel", "kryptographie", "management"],
"action_hint": "maintain",
"object_hint": "Schluesselverwaltung",
"object_class": "cryptographic_control"
},
{
"subcontrol_id": "SC-13",
"title": "Cryptographic Protection",
"statement": "Kryptographische Mechanismen muessen gemaess anerkannten Standards implementiert werden.",
"keywords": ["kryptographie", "verschluesselung", "standard"],
"action_hint": "implement",
"object_hint": "Kryptographischer Schutz",
"object_class": "cryptographic_control"
}
]
},
{
"domain_id": "SI",
"title": "System and Information Integrity",
"aliases": ["system integrity", "information integrity", "systemintegritaet", "informationsintegritaet"],
"keywords": ["integritaet", "integrity", "malware", "patch", "flaw", "schwachstelle"],
"subcontrols": [
{
"subcontrol_id": "SI-1",
"title": "Policy and Procedures",
"statement": "System- und Informationsintegritaetsrichtlinien muessen dokumentiert und regelmaessig ueberprueft werden.",
"keywords": ["policy", "richtlinie"],
"action_hint": "document",
"object_hint": "Integritaetsrichtlinie",
"object_class": "policy"
},
{
"subcontrol_id": "SI-2",
"title": "Flaw Remediation",
"statement": "Bekannte Schwachstellen muessen innerhalb definierter Fristen behoben werden.",
"keywords": ["flaw", "schwachstelle", "patch", "behebung", "remediation"],
"action_hint": "remediate",
"object_hint": "Schwachstellenbehebung",
"object_class": "system"
},
{
"subcontrol_id": "SI-3",
"title": "Malicious Code Protection",
"statement": "Systeme muessen vor Schadsoftware geschuetzt werden durch Erkennung und Abwehrmechanismen.",
"keywords": ["malware", "schadsoftware", "antivirus", "erkennung"],
"action_hint": "implement",
"object_hint": "Schadsoftwareschutz",
"object_class": "technical_control"
},
{
"subcontrol_id": "SI-4",
"title": "System Monitoring",
"statement": "Systeme muessen kontinuierlich auf Sicherheitsereignisse und Anomalien ueberwacht werden.",
"keywords": ["monitoring", "ueberwachung", "anomalie", "siem"],
"action_hint": "monitor",
"object_hint": "Systemueberwachung",
"object_class": "system"
},
{
"subcontrol_id": "SI-5",
"title": "Security Alerts and Advisories",
"statement": "Sicherheitswarnungen muessen empfangen, bewertet und darauf reagiert werden.",
"keywords": ["alert", "warnung", "advisory", "cve"],
"action_hint": "monitor",
"object_hint": "Sicherheitswarnungen",
"object_class": "incident"
}
]
},
{
"domain_id": "SA",
"title": "System and Services Acquisition",
"aliases": ["system acquisition", "services acquisition", "systembeschaffung", "secure development"],
"keywords": ["beschaffung", "acquisition", "entwicklung", "development", "lieferkette", "supply chain"],
"subcontrols": [
{
"subcontrol_id": "SA-1",
"title": "Policy and Procedures",
"statement": "Beschaffungsrichtlinien mit Sicherheitsanforderungen muessen dokumentiert werden.",
"keywords": ["policy", "richtlinie", "beschaffung"],
"action_hint": "document",
"object_hint": "Beschaffungsrichtlinie",
"object_class": "policy"
},
{
"subcontrol_id": "SA-8",
"title": "Security and Privacy Engineering Principles",
"statement": "Sicherheits- und Datenschutzprinzipien muessen in die Systementwicklung integriert werden.",
"keywords": ["engineering", "development", "prinzipien", "design"],
"action_hint": "implement",
"object_hint": "Security-by-Design-Prinzipien",
"object_class": "process"
},
{
"subcontrol_id": "SA-11",
"title": "Developer Testing and Evaluation",
"statement": "Entwickler muessen Sicherheitstests und Code-Reviews durchfuehren.",
"keywords": ["testing", "test", "code review", "evaluation"],
"action_hint": "test",
"object_hint": "Entwickler-Sicherheitstests",
"object_class": "process"
},
{
"subcontrol_id": "SA-12",
"title": "Supply Chain Protection",
"statement": "Lieferkettenrisiken muessen bewertet und Schutzmassnahmen implementiert werden.",
"keywords": ["supply chain", "lieferkette", "third party", "drittanbieter"],
"action_hint": "assess",
"object_hint": "Lieferkettenrisikobewertung",
"object_class": "risk_artifact"
}
]
}
]
}

View File

@@ -0,0 +1,353 @@
{
"framework_id": "OWASP_ASVS",
"display_name": "OWASP Application Security Verification Standard 4.0",
"license": {
"type": "cc_by_sa_4",
"rag_allowed": true,
"use_as_metadata": true
},
"domains": [
{
"domain_id": "V1",
"title": "Architecture, Design and Threat Modeling",
"aliases": ["architecture", "architektur", "design", "threat modeling", "bedrohungsmodellierung"],
"keywords": ["architektur", "design", "threat model", "bedrohung", "modellierung"],
"subcontrols": [
{
"subcontrol_id": "V1.1",
"title": "Secure Software Development Lifecycle",
"statement": "Ein sicherer Softwareentwicklungs-Lebenszyklus (SSDLC) muss definiert und angewendet werden.",
"keywords": ["sdlc", "lifecycle", "lebenszyklus", "entwicklung"],
"action_hint": "implement",
"object_hint": "Sicherer Entwicklungs-Lebenszyklus",
"object_class": "process"
},
{
"subcontrol_id": "V1.2",
"title": "Authentication Architecture",
"statement": "Die Authentifizierungsarchitektur muss dokumentiert und regelmaessig ueberprueft werden.",
"keywords": ["authentication", "authentifizierung", "architektur"],
"action_hint": "document",
"object_hint": "Authentifizierungsarchitektur",
"object_class": "policy"
},
{
"subcontrol_id": "V1.4",
"title": "Access Control Architecture",
"statement": "Die Zugriffskontrollarchitektur muss dokumentiert und zentral durchgesetzt werden.",
"keywords": ["access control", "zugriffskontrolle", "architektur"],
"action_hint": "document",
"object_hint": "Zugriffskontrollarchitektur",
"object_class": "policy"
},
{
"subcontrol_id": "V1.5",
"title": "Input and Output Architecture",
"statement": "Eingabe- und Ausgabevalidierung muss architektonisch verankert und durchgaengig angewendet werden.",
"keywords": ["input", "output", "eingabe", "ausgabe", "validierung"],
"action_hint": "implement",
"object_hint": "Ein-/Ausgabevalidierung",
"object_class": "technical_control"
},
{
"subcontrol_id": "V1.6",
"title": "Cryptographic Architecture",
"statement": "Kryptographische Mechanismen muessen architektonisch definiert und standardisiert sein.",
"keywords": ["crypto", "kryptographie", "verschluesselung"],
"action_hint": "define",
"object_hint": "Kryptographie-Architektur",
"object_class": "cryptographic_control"
}
]
},
{
"domain_id": "V2",
"title": "Authentication",
"aliases": ["authentication", "authentifizierung", "anmeldung", "login"],
"keywords": ["authentication", "authentifizierung", "passwort", "login", "anmeldung", "credential"],
"subcontrols": [
{
"subcontrol_id": "V2.1",
"title": "Password Security",
"statement": "Passwortrichtlinien muessen Mindestlaenge, Komplexitaet und Sperrmechanismen definieren.",
"keywords": ["passwort", "password", "laenge", "komplexitaet"],
"action_hint": "define",
"object_hint": "Passwortrichtlinie",
"object_class": "policy"
},
{
"subcontrol_id": "V2.2",
"title": "General Authenticator Security",
"statement": "Authentifizierungsmittel muessen sicher gespeichert und uebertragen werden.",
"keywords": ["authenticator", "credential", "speicherung"],
"action_hint": "implement",
"object_hint": "Sichere Credential-Verwaltung",
"object_class": "technical_control"
},
{
"subcontrol_id": "V2.7",
"title": "Out-of-Band Verification",
"statement": "Out-of-Band-Verifikationsmechanismen muessen sicher implementiert werden.",
"keywords": ["oob", "out-of-band", "sms", "push"],
"action_hint": "implement",
"object_hint": "Out-of-Band-Verifikation",
"object_class": "technical_control"
},
{
"subcontrol_id": "V2.8",
"title": "Multi-Factor Authentication",
"statement": "Multi-Faktor-Authentifizierung muss fuer sicherheitskritische Funktionen verfuegbar sein.",
"keywords": ["mfa", "multi-faktor", "totp", "fido"],
"action_hint": "implement",
"object_hint": "Multi-Faktor-Authentifizierung",
"object_class": "technical_control"
}
]
},
{
"domain_id": "V3",
"title": "Session Management",
"aliases": ["session", "sitzung", "session management", "sitzungsverwaltung"],
"keywords": ["session", "sitzung", "token", "cookie", "timeout"],
"subcontrols": [
{
"subcontrol_id": "V3.1",
"title": "Session Management Security",
"statement": "Sitzungstoken muessen sicher erzeugt, uebertragen und invalidiert werden.",
"keywords": ["token", "sitzung", "sicherheit"],
"action_hint": "implement",
"object_hint": "Sichere Sitzungsverwaltung",
"object_class": "technical_control"
},
{
"subcontrol_id": "V3.3",
"title": "Session Termination",
"statement": "Sitzungen muessen nach Inaktivitaet und bei Abmeldung zuverlaessig beendet werden.",
"keywords": ["termination", "timeout", "abmeldung", "beenden"],
"action_hint": "configure",
"object_hint": "Sitzungstimeout",
"object_class": "configuration"
},
{
"subcontrol_id": "V3.5",
"title": "Token-Based Session Management",
"statement": "Tokenbasierte Sitzungsmechanismen muessen gegen Diebstahl und Replay geschuetzt sein.",
"keywords": ["jwt", "token", "replay", "diebstahl"],
"action_hint": "implement",
"object_hint": "Token-Schutz",
"object_class": "technical_control"
}
]
},
{
"domain_id": "V5",
"title": "Validation, Sanitization and Encoding",
"aliases": ["validation", "validierung", "sanitization", "encoding", "eingabevalidierung"],
"keywords": ["validierung", "sanitization", "encoding", "xss", "injection", "eingabe"],
"subcontrols": [
{
"subcontrol_id": "V5.1",
"title": "Input Validation",
"statement": "Alle Eingabedaten muessen serverseitig validiert werden.",
"keywords": ["input", "eingabe", "validierung", "serverseitig"],
"action_hint": "implement",
"object_hint": "Eingabevalidierung",
"object_class": "technical_control"
},
{
"subcontrol_id": "V5.2",
"title": "Sanitization and Sandboxing",
"statement": "Eingaben muessen bereinigt und in sicherer Umgebung verarbeitet werden.",
"keywords": ["sanitization", "bereinigung", "sandbox"],
"action_hint": "implement",
"object_hint": "Eingabebereinigung",
"object_class": "technical_control"
},
{
"subcontrol_id": "V5.3",
"title": "Output Encoding and Injection Prevention",
"statement": "Ausgaben muessen kontextabhaengig kodiert werden, um Injection-Angriffe zu verhindern.",
"keywords": ["output", "encoding", "injection", "xss", "sql"],
"action_hint": "implement",
"object_hint": "Ausgabe-Encoding",
"object_class": "technical_control"
}
]
},
{
"domain_id": "V6",
"title": "Stored Cryptography",
"aliases": ["cryptography", "kryptographie", "verschluesselung", "stored cryptography"],
"keywords": ["kryptographie", "verschluesselung", "hashing", "schluessel", "key management"],
"subcontrols": [
{
"subcontrol_id": "V6.1",
"title": "Data Classification",
"statement": "Daten muessen klassifiziert und entsprechend ihrer Schutzklasse behandelt werden.",
"keywords": ["klassifizierung", "classification", "schutzklasse"],
"action_hint": "define",
"object_hint": "Datenklassifizierung",
"object_class": "data"
},
{
"subcontrol_id": "V6.2",
"title": "Algorithms",
"statement": "Nur zugelassene und aktuelle kryptographische Algorithmen duerfen verwendet werden.",
"keywords": ["algorithmus", "algorithm", "aes", "rsa"],
"action_hint": "configure",
"object_hint": "Kryptographische Algorithmen",
"object_class": "cryptographic_control"
},
{
"subcontrol_id": "V6.4",
"title": "Secret Management",
"statement": "Geheimnisse (Schluessel, Passwoerter, Tokens) muessen in einem Secret-Management-System verwaltet werden.",
"keywords": ["secret", "geheimnis", "vault", "key management"],
"action_hint": "maintain",
"object_hint": "Secret-Management",
"object_class": "cryptographic_control"
}
]
},
{
"domain_id": "V8",
"title": "Data Protection",
"aliases": ["data protection", "datenschutz", "datenverarbeitung"],
"keywords": ["datenschutz", "data protection", "pii", "personenbezogen", "privacy"],
"subcontrols": [
{
"subcontrol_id": "V8.1",
"title": "General Data Protection",
"statement": "Personenbezogene Daten muessen gemaess Datenschutzanforderungen geschuetzt werden.",
"keywords": ["personenbezogen", "pii", "datenschutz"],
"action_hint": "implement",
"object_hint": "Datenschutzmassnahmen",
"object_class": "data"
},
{
"subcontrol_id": "V8.2",
"title": "Client-Side Data Protection",
"statement": "Clientseitig gespeicherte sensible Daten muessen geschuetzt und minimiert werden.",
"keywords": ["client", "browser", "localstorage", "cookie"],
"action_hint": "implement",
"object_hint": "Clientseitiger Datenschutz",
"object_class": "technical_control"
},
{
"subcontrol_id": "V8.3",
"title": "Sensitive Private Data",
"statement": "Sensible Daten muessen bei Speicherung und Verarbeitung besonders geschuetzt werden.",
"keywords": ["sensibel", "vertraulich", "speicherung"],
"action_hint": "encrypt",
"object_hint": "Verschluesselung sensibler Daten",
"object_class": "data"
}
]
},
{
"domain_id": "V9",
"title": "Communication",
"aliases": ["communication", "kommunikation", "tls", "transport"],
"keywords": ["tls", "ssl", "https", "transport", "kommunikation", "verschluesselung"],
"subcontrols": [
{
"subcontrol_id": "V9.1",
"title": "Client Communication Security",
"statement": "Alle Client-Server-Kommunikation muss ueber TLS verschluesselt werden.",
"keywords": ["tls", "https", "client", "server"],
"action_hint": "encrypt",
"object_hint": "TLS-Transportverschluesselung",
"object_class": "cryptographic_control"
},
{
"subcontrol_id": "V9.2",
"title": "Server Communication Security",
"statement": "Server-zu-Server-Kommunikation muss authentifiziert und verschluesselt erfolgen.",
"keywords": ["server", "mtls", "backend"],
"action_hint": "encrypt",
"object_hint": "Server-Kommunikationsverschluesselung",
"object_class": "cryptographic_control"
}
]
},
{
"domain_id": "V13",
"title": "API and Web Service",
"aliases": ["api", "web service", "rest", "graphql", "webservice"],
"keywords": ["api", "rest", "graphql", "webservice", "endpoint", "schnittstelle"],
"subcontrols": [
{
"subcontrol_id": "V13.1",
"title": "Generic Web Service Security",
"statement": "Web-Services muessen gegen gaengige Angriffe abgesichert werden.",
"keywords": ["web service", "sicherheit", "angriff"],
"action_hint": "implement",
"object_hint": "Web-Service-Absicherung",
"object_class": "interface"
},
{
"subcontrol_id": "V13.2",
"title": "RESTful Web Service",
"statement": "REST-APIs muessen Input-Validierung, Rate Limiting und sichere Authentifizierung implementieren.",
"keywords": ["rest", "api", "rate limiting", "input"],
"action_hint": "implement",
"object_hint": "REST-API-Absicherung",
"object_class": "interface"
},
{
"subcontrol_id": "V13.4",
"title": "GraphQL and Web Services",
"statement": "GraphQL-Endpoints muessen gegen Query-Complexity-Angriffe und Introspection geschuetzt werden.",
"keywords": ["graphql", "query", "complexity", "introspection"],
"action_hint": "configure",
"object_hint": "GraphQL-Absicherung",
"object_class": "interface"
}
]
},
{
"domain_id": "V14",
"title": "Configuration",
"aliases": ["configuration", "konfiguration", "hardening", "haertung"],
"keywords": ["konfiguration", "hardening", "haertung", "header", "deployment"],
"subcontrols": [
{
"subcontrol_id": "V14.1",
"title": "Build and Deploy",
"statement": "Build- und Deployment-Prozesse muessen sicher konfiguriert und reproduzierbar sein.",
"keywords": ["build", "deploy", "ci/cd", "pipeline"],
"action_hint": "configure",
"object_hint": "Sichere Build-Pipeline",
"object_class": "configuration"
},
{
"subcontrol_id": "V14.2",
"title": "Dependency Management",
"statement": "Abhaengigkeiten muessen auf Schwachstellen geprueft und aktuell gehalten werden.",
"keywords": ["dependency", "abhaengigkeit", "sca", "sbom"],
"action_hint": "maintain",
"object_hint": "Abhaengigkeitsverwaltung",
"object_class": "system"
},
{
"subcontrol_id": "V14.3",
"title": "Unintended Security Disclosure",
"statement": "Fehlermeldungen und Debug-Informationen duerfen keine sicherheitsrelevanten Details preisgeben.",
"keywords": ["disclosure", "fehlermeldung", "debug", "information leakage"],
"action_hint": "configure",
"object_hint": "Fehlerbehandlung",
"object_class": "configuration"
},
{
"subcontrol_id": "V14.4",
"title": "HTTP Security Headers",
"statement": "HTTP-Sicherheitsheader muessen korrekt konfiguriert sein.",
"keywords": ["header", "csp", "hsts", "x-frame"],
"action_hint": "configure",
"object_hint": "HTTP-Sicherheitsheader",
"object_class": "configuration"
}
]
}
]
}

View File

@@ -0,0 +1,205 @@
"""
Source-Type-Klassifikation fuer Regulierungen und Frameworks.
Dreistufiges Modell der normativen Verbindlichkeit:
Stufe 1 — GESETZ (law):
Rechtlich bindend. Bussgeld bei Verstoss.
Beispiele: DSGVO, NIS2, AI Act, CRA
Stufe 2 — LEITLINIE (guideline):
Offizielle Auslegungshilfe von Aufsichtsbehoerden.
Beweislastumkehr: Wer abweicht, muss begruenden warum.
Beispiele: EDPB-Leitlinien, BSI-Standards, WP29-Dokumente
Stufe 3 — FRAMEWORK (framework):
Freiwillige Best Practices, nicht rechtsverbindlich.
Aber: Koennen als "Stand der Technik" herangezogen werden.
Beispiele: ENISA, NIST, OWASP, OECD, CISA
Mapping: source_regulation (aus control_parent_links) -> source_type
"""
# --- Typ-Definitionen ---
SOURCE_TYPE_LAW = "law" # Gesetz/Verordnung/Richtlinie — normative_strength bleibt
SOURCE_TYPE_GUIDELINE = "guideline" # Leitlinie/Standard — max "should"
SOURCE_TYPE_FRAMEWORK = "framework" # Framework/Best Practice — max "may"
# Max erlaubte normative_strength pro source_type
# DB-Constraint erlaubt: must, should, may (NICHT "can")
NORMATIVE_STRENGTH_CAP: dict[str, str] = {
SOURCE_TYPE_LAW: "must", # keine Begrenzung
SOURCE_TYPE_GUIDELINE: "should", # max "should"
SOURCE_TYPE_FRAMEWORK: "may", # max "may" (= "kann")
}
# Reihenfolge fuer Vergleiche (hoeher = staerker)
STRENGTH_ORDER: dict[str, int] = {
"may": 1, # KANN (DB-Wert)
"can": 1, # Alias — wird in cap_normative_strength zu "may" normalisiert
"should": 2,
"must": 3,
}
def cap_normative_strength(original: str, source_type: str) -> str:
"""
Begrenzt die normative_strength basierend auf dem source_type.
Beispiel:
cap_normative_strength("must", "framework") -> "may"
cap_normative_strength("should", "law") -> "should"
cap_normative_strength("must", "guideline") -> "should"
"""
cap = NORMATIVE_STRENGTH_CAP.get(source_type, "must")
cap_level = STRENGTH_ORDER.get(cap, 3)
original_level = STRENGTH_ORDER.get(original, 3)
if original_level > cap_level:
return cap
return original
def get_highest_source_type(source_types: list[str]) -> str:
"""
Bestimmt den hoechsten source_type aus einer Liste.
Ein Gesetz uebertrumpft alles.
Beispiel:
get_highest_source_type(["framework", "law"]) -> "law"
get_highest_source_type(["framework", "guideline"]) -> "guideline"
"""
type_order = {SOURCE_TYPE_FRAMEWORK: 1, SOURCE_TYPE_GUIDELINE: 2, SOURCE_TYPE_LAW: 3}
if not source_types:
return SOURCE_TYPE_FRAMEWORK
return max(source_types, key=lambda t: type_order.get(t, 0))
# ============================================================================
# Klassifikation: source_regulation -> source_type
#
# Diese Map wird fuer den Backfill und zukuenftige Pipeline-Runs verwendet.
# Neue Regulierungen hier eintragen!
# ============================================================================
SOURCE_REGULATION_CLASSIFICATION: dict[str, str] = {
# --- EU-Verordnungen (unmittelbar bindend) ---
"DSGVO (EU) 2016/679": SOURCE_TYPE_LAW,
"KI-Verordnung (EU) 2024/1689": SOURCE_TYPE_LAW,
"Cyber Resilience Act (CRA)": SOURCE_TYPE_LAW,
"NIS2-Richtlinie (EU) 2022/2555": SOURCE_TYPE_LAW,
"Data Act": SOURCE_TYPE_LAW,
"Data Governance Act (DGA)": SOURCE_TYPE_LAW,
"Markets in Crypto-Assets (MiCA)": SOURCE_TYPE_LAW,
"Maschinenverordnung (EU) 2023/1230": SOURCE_TYPE_LAW,
"Batterieverordnung (EU) 2023/1542": SOURCE_TYPE_LAW,
"AML-Verordnung": SOURCE_TYPE_LAW,
# --- EU-Richtlinien (nach nationaler Umsetzung bindend) ---
# Fuer Compliance-Zwecke wie Gesetze behandeln
# --- Nationale Gesetze ---
"Bundesdatenschutzgesetz (BDSG)": SOURCE_TYPE_LAW,
"Telekommunikationsgesetz": SOURCE_TYPE_LAW,
"Telekommunikationsgesetz Oesterreich": SOURCE_TYPE_LAW,
"Gewerbeordnung (GewO)": SOURCE_TYPE_LAW,
"Handelsgesetzbuch (HGB)": SOURCE_TYPE_LAW,
"Abgabenordnung (AO)": SOURCE_TYPE_LAW,
"IFRS-Übernahmeverordnung": SOURCE_TYPE_LAW,
"Österreichisches Datenschutzgesetz (DSG)": SOURCE_TYPE_LAW,
"LOPDGDD - Ley Orgánica de Protección de Datos (Spanien)": SOURCE_TYPE_LAW,
"Loi Informatique et Libertés (Frankreich)": SOURCE_TYPE_LAW,
"Információs önrendelkezési jog törvény (Ungarn)": SOURCE_TYPE_LAW,
"EU Blue Guide 2022": SOURCE_TYPE_LAW,
# --- EDPB/WP29 Leitlinien (offizielle Auslegungshilfe) ---
"EDPB Leitlinien 01/2019 (Zertifizierung)": SOURCE_TYPE_GUIDELINE,
"EDPB Leitlinien 01/2020 (Datentransfers)": SOURCE_TYPE_GUIDELINE,
"EDPB Leitlinien 01/2020 (Vernetzte Fahrzeuge)": SOURCE_TYPE_GUIDELINE,
"EDPB Leitlinien 01/2022 (BCR)": SOURCE_TYPE_GUIDELINE,
"EDPB Leitlinien 01/2024 (Berechtigtes Interesse)": SOURCE_TYPE_GUIDELINE,
"EDPB Leitlinien 04/2019 (Data Protection by Design)": SOURCE_TYPE_GUIDELINE,
"EDPB Leitlinien 05/2020 - Einwilligung": SOURCE_TYPE_GUIDELINE,
"EDPB Leitlinien 07/2020 (Datentransfers)": SOURCE_TYPE_GUIDELINE,
"EDPB Leitlinien 08/2020 (Social Media)": SOURCE_TYPE_GUIDELINE,
"EDPB Leitlinien 09/2022 (Data Breach)": SOURCE_TYPE_GUIDELINE,
"EDPB Leitlinien 09/2022 - Meldung von Datenschutzverletzungen": SOURCE_TYPE_GUIDELINE,
"EDPB Empfehlungen 01/2020 - Ergaenzende Massnahmen fuer Datentransfers": SOURCE_TYPE_GUIDELINE,
"EDPB Leitlinien - Berechtigtes Interesse (Art. 6(1)(f))": SOURCE_TYPE_GUIDELINE,
"WP244 Leitlinien (Profiling)": SOURCE_TYPE_GUIDELINE,
"WP251 Leitlinien (Profiling)": SOURCE_TYPE_GUIDELINE,
"WP260 Leitlinien (Transparenz)": SOURCE_TYPE_GUIDELINE,
# --- BSI Standards (behoerdliche technische Richtlinien) ---
"BSI-TR-03161-1": SOURCE_TYPE_GUIDELINE,
"BSI-TR-03161-2": SOURCE_TYPE_GUIDELINE,
"BSI-TR-03161-3": SOURCE_TYPE_GUIDELINE,
# --- ENISA (EU-Agentur, aber Empfehlungen nicht rechtsverbindlich) ---
"ENISA Cybersecurity State 2024": SOURCE_TYPE_FRAMEWORK,
"ENISA ICS/SCADA Dependencies": SOURCE_TYPE_FRAMEWORK,
"ENISA Supply Chain Good Practices": SOURCE_TYPE_FRAMEWORK,
"ENISA Threat Landscape Supply Chain": SOURCE_TYPE_FRAMEWORK,
# --- NIST (US-Standards, international als Best Practice) ---
"NIST AI Risk Management Framework": SOURCE_TYPE_FRAMEWORK,
"NIST Cybersecurity Framework 2.0": SOURCE_TYPE_FRAMEWORK,
"NIST SP 800-207 (Zero Trust)": SOURCE_TYPE_FRAMEWORK,
"NIST SP 800-218 (SSDF)": SOURCE_TYPE_FRAMEWORK,
"NIST SP 800-53 Rev. 5": SOURCE_TYPE_FRAMEWORK,
"NIST SP 800-63-3": SOURCE_TYPE_FRAMEWORK,
# --- OWASP (Community-Standards) ---
"OWASP API Security Top 10 (2023)": SOURCE_TYPE_FRAMEWORK,
"OWASP ASVS 4.0": SOURCE_TYPE_FRAMEWORK,
"OWASP MASVS 2.0": SOURCE_TYPE_FRAMEWORK,
"OWASP SAMM 2.0": SOURCE_TYPE_FRAMEWORK,
"OWASP Top 10 (2021)": SOURCE_TYPE_FRAMEWORK,
# --- Sonstige Frameworks ---
"OECD KI-Empfehlung": SOURCE_TYPE_FRAMEWORK,
"CISA Secure by Design": SOURCE_TYPE_FRAMEWORK,
}
def classify_source_regulation(source_regulation: str) -> str:
"""
Klassifiziert eine source_regulation als law, guideline oder framework.
Verwendet exaktes Matching gegen die Map. Bei unbekannten Quellen
wird anhand von Schluesselwoertern geraten, Fallback ist 'framework'
(konservativstes Ergebnis).
"""
if not source_regulation:
return SOURCE_TYPE_FRAMEWORK
# Exaktes Match
if source_regulation in SOURCE_REGULATION_CLASSIFICATION:
return SOURCE_REGULATION_CLASSIFICATION[source_regulation]
# Heuristik fuer unbekannte Quellen
lower = source_regulation.lower()
# Gesetze erkennen
law_indicators = [
"verordnung", "richtlinie", "gesetz", "directive", "regulation",
"(eu)", "(eg)", "act", "ley", "loi", "törvény", "código",
]
if any(ind in lower for ind in law_indicators):
return SOURCE_TYPE_LAW
# Leitlinien erkennen
guideline_indicators = [
"edpb", "leitlinie", "guideline", "wp2", "bsi", "empfehlung",
]
if any(ind in lower for ind in guideline_indicators):
return SOURCE_TYPE_GUIDELINE
# Frameworks erkennen
framework_indicators = [
"enisa", "nist", "owasp", "oecd", "cisa", "framework", "iso",
]
if any(ind in lower for ind in framework_indicators):
return SOURCE_TYPE_FRAMEWORK
# Konservativ: unbekannt = framework (geringste Verbindlichkeit)
return SOURCE_TYPE_FRAMEWORK

View File

@@ -8,12 +8,16 @@ from .models import (
EvidenceDB,
RiskDB,
AuditExportDB,
LLMGenerationAuditDB,
AssertionDB,
RegulationTypeEnum,
ControlTypeEnum,
ControlDomainEnum,
RiskLevelEnum,
EvidenceStatusEnum,
ControlStatusEnum,
EvidenceConfidenceEnum,
EvidenceTruthStatusEnum,
)
from .repository import (
RegulationRepository,
@@ -33,6 +37,8 @@ __all__ = [
"EvidenceDB",
"RiskDB",
"AuditExportDB",
"LLMGenerationAuditDB",
"AssertionDB",
# Enums
"RegulationTypeEnum",
"ControlTypeEnum",
@@ -40,6 +46,8 @@ __all__ = [
"RiskLevelEnum",
"EvidenceStatusEnum",
"ControlStatusEnum",
"EvidenceConfidenceEnum",
"EvidenceTruthStatusEnum",
# Repositories
"RegulationRepository",
"RequirementRepository",

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,164 @@
"""
SQLAlchemy models for VVT Master Libraries + Process Templates.
Tables (global, no tenant_id):
- vvt_lib_data_subjects
- vvt_lib_data_categories (hierarchical, self-referencing)
- vvt_lib_recipients
- vvt_lib_legal_bases
- vvt_lib_retention_rules
- vvt_lib_transfer_mechanisms
- vvt_lib_purposes
- vvt_lib_toms
Tenant-scoped:
- vvt_process_templates (system + tenant-specific)
"""
from datetime import datetime
from sqlalchemy import (
Column, String, Text, Boolean, Integer, DateTime, JSON, Index,
ForeignKey,
)
from sqlalchemy.dialects.postgresql import UUID
from classroom_engine.database import Base
class VVTLibDataSubjectDB(Base):
__tablename__ = 'vvt_lib_data_subjects'
id = Column(String(50), primary_key=True)
label_de = Column(String(200), nullable=False)
description_de = Column(Text)
art9_relevant = Column(Boolean, default=False)
typical_for = Column(JSON, default=list)
sort_order = Column(Integer, default=0)
created_at = Column(DateTime(timezone=True), default=datetime.utcnow)
class VVTLibDataCategoryDB(Base):
__tablename__ = 'vvt_lib_data_categories'
id = Column(String(50), primary_key=True)
parent_id = Column(String(50), ForeignKey('vvt_lib_data_categories.id', ondelete='SET NULL'), nullable=True)
label_de = Column(String(200), nullable=False)
description_de = Column(Text)
is_art9 = Column(Boolean, default=False)
is_art10 = Column(Boolean, default=False)
risk_weight = Column(Integer, default=1)
default_retention_rule = Column(String(50))
default_legal_basis = Column(String(50))
sort_order = Column(Integer, default=0)
created_at = Column(DateTime(timezone=True), default=datetime.utcnow)
class VVTLibRecipientDB(Base):
__tablename__ = 'vvt_lib_recipients'
id = Column(String(50), primary_key=True)
type = Column(String(20), nullable=False)
label_de = Column(String(200), nullable=False)
description_de = Column(Text)
is_third_country = Column(Boolean, default=False)
country = Column(String(5))
sort_order = Column(Integer, default=0)
created_at = Column(DateTime(timezone=True), default=datetime.utcnow)
class VVTLibLegalBasisDB(Base):
__tablename__ = 'vvt_lib_legal_bases'
id = Column(String(50), primary_key=True)
article = Column(String(50), nullable=False)
type = Column(String(30), nullable=False)
label_de = Column(String(300), nullable=False)
description_de = Column(Text)
is_art9 = Column(Boolean, default=False)
typical_national_law = Column(String(100))
sort_order = Column(Integer, default=0)
created_at = Column(DateTime(timezone=True), default=datetime.utcnow)
class VVTLibRetentionRuleDB(Base):
__tablename__ = 'vvt_lib_retention_rules'
id = Column(String(50), primary_key=True)
label_de = Column(String(300), nullable=False)
description_de = Column(Text)
legal_basis = Column(String(200))
duration = Column(Integer, nullable=False)
duration_unit = Column(String(10), nullable=False)
start_event = Column(String(200))
deletion_procedure = Column(String(500))
sort_order = Column(Integer, default=0)
created_at = Column(DateTime(timezone=True), default=datetime.utcnow)
class VVTLibTransferMechanismDB(Base):
__tablename__ = 'vvt_lib_transfer_mechanisms'
id = Column(String(50), primary_key=True)
label_de = Column(String(300), nullable=False)
description_de = Column(Text)
article = Column(String(50))
requires_tia = Column(Boolean, default=False)
sort_order = Column(Integer, default=0)
created_at = Column(DateTime(timezone=True), default=datetime.utcnow)
class VVTLibPurposeDB(Base):
__tablename__ = 'vvt_lib_purposes'
id = Column(String(50), primary_key=True)
label_de = Column(String(300), nullable=False)
description_de = Column(Text)
typical_legal_basis = Column(String(50))
typical_for = Column(JSON, default=list)
sort_order = Column(Integer, default=0)
created_at = Column(DateTime(timezone=True), default=datetime.utcnow)
class VVTLibTomDB(Base):
__tablename__ = 'vvt_lib_toms'
id = Column(String(50), primary_key=True)
category = Column(String(30), nullable=False)
label_de = Column(String(300), nullable=False)
description_de = Column(Text)
art32_reference = Column(String(100))
sort_order = Column(Integer, default=0)
created_at = Column(DateTime(timezone=True), default=datetime.utcnow)
class VVTProcessTemplateDB(Base):
__tablename__ = 'vvt_process_templates'
id = Column(String(80), primary_key=True)
name = Column(String(300), nullable=False)
description = Column(Text)
business_function = Column(String(50))
purpose_refs = Column(JSON, default=list)
legal_basis_refs = Column(JSON, default=list)
data_subject_refs = Column(JSON, default=list)
data_category_refs = Column(JSON, default=list)
recipient_refs = Column(JSON, default=list)
tom_refs = Column(JSON, default=list)
transfer_mechanism_refs = Column(JSON, default=list)
retention_rule_ref = Column(String(50))
typical_systems = Column(JSON, default=list)
protection_level = Column(String(10), default='MEDIUM')
dpia_required = Column(Boolean, default=False)
risk_score = Column(Integer)
tags = Column(JSON, default=list)
is_system = Column(Boolean, default=True)
tenant_id = Column(UUID(as_uuid=True), nullable=True)
sort_order = Column(Integer, default=0)
created_at = Column(DateTime(timezone=True), default=datetime.utcnow)
updated_at = Column(DateTime(timezone=True), default=datetime.utcnow, onupdate=datetime.utcnow)
__table_args__ = (
Index('idx_vvt_process_templates_bf', 'business_function'),
Index('idx_vvt_process_templates_system', 'is_system'),
)

View File

@@ -79,6 +79,26 @@ class VVTActivityDB(Base):
next_review_at = Column(DateTime(timezone=True), nullable=True)
created_by = Column(String(200), default='system')
dsfa_id = Column(UUID(as_uuid=True), nullable=True)
# Library refs (Phase 1 — parallel to freetext fields)
purpose_refs = Column(JSON, nullable=True)
legal_basis_refs = Column(JSON, nullable=True)
data_subject_refs = Column(JSON, nullable=True)
data_category_refs = Column(JSON, nullable=True)
recipient_refs = Column(JSON, nullable=True)
retention_rule_ref = Column(String(50), nullable=True)
transfer_mechanism_refs = Column(JSON, nullable=True)
tom_refs = Column(JSON, nullable=True)
# Cross-module links
linked_loeschfristen_ids = Column(JSON, nullable=True)
linked_tom_measure_ids = Column(JSON, nullable=True)
# Template + risk
source_template_id = Column(String(80), nullable=True)
risk_score = Column(Integer, nullable=True)
art30_completeness = Column(JSON, nullable=True)
created_at = Column(DateTime, default=datetime.utcnow, nullable=False)
updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)

View File

@@ -69,7 +69,7 @@ class AnchorFinder:
tags_str = " ".join(control.tags[:3]) if control.tags else ""
query = f"{control.title} {tags_str}".strip()
results = await self.rag.search(
results = await self.rag.search_with_rerank(
query=query,
collection="bp_compliance_ce",
top_k=15,

View File

@@ -0,0 +1,80 @@
"""Assertion Engine — splits text into sentences and classifies each.
Each sentence is tagged as:
- assertion: normative statement (pflicht / empfehlung / kann)
- fact: references concrete evidence artifacts
- rationale: explains why something is required
"""
import re
from typing import Optional
from .normative_patterns import (
PFLICHT_RE, EMPFEHLUNG_RE, KANN_RE, RATIONALE_RE, EVIDENCE_RE,
)
# Sentence splitter: period/excl/question followed by space+uppercase, or newlines
_SENTENCE_SPLIT = re.compile(r'(?<=[.!?])\s+(?=[A-ZÄÖÜ])|(?:\n\s*\n)')
def extract_assertions(
text: str,
entity_type: str,
entity_id: str,
tenant_id: Optional[str] = None,
) -> list[dict]:
"""Split *text* into sentences and classify each one.
Returns a list of dicts ready for AssertionDB creation.
"""
if not text or not text.strip():
return []
sentences = _SENTENCE_SPLIT.split(text.strip())
results: list[dict] = []
for idx, raw in enumerate(sentences):
sentence = raw.strip()
if not sentence or len(sentence) < 5:
continue
assertion_type, normative_tier = _classify_sentence(sentence)
results.append({
"tenant_id": tenant_id,
"entity_type": entity_type,
"entity_id": entity_id,
"sentence_text": sentence,
"sentence_index": idx,
"assertion_type": assertion_type,
"normative_tier": normative_tier,
"evidence_ids": [],
"confidence": 0.0,
})
return results
def _classify_sentence(sentence: str) -> tuple[str, Optional[str]]:
"""Return (assertion_type, normative_tier) for a single sentence."""
# 1. Check for evidence/fact keywords first
if EVIDENCE_RE.search(sentence):
return ("fact", None)
# 2. Check for rationale
normative_count = len(PFLICHT_RE.findall(sentence)) + len(EMPFEHLUNG_RE.findall(sentence)) + len(KANN_RE.findall(sentence))
rationale_count = len(RATIONALE_RE.findall(sentence))
if rationale_count > 0 and rationale_count >= normative_count:
return ("rationale", None)
# 3. Normative classification
if PFLICHT_RE.search(sentence):
return ("assertion", "pflicht")
if EMPFEHLUNG_RE.search(sentence):
return ("assertion", "empfehlung")
if KANN_RE.search(sentence):
return ("assertion", "kann")
# 4. Default: unclassified assertion
return ("assertion", None)

View File

@@ -0,0 +1,618 @@
"""Batch Dedup Runner — Orchestrates deduplication of ~85k atomare Controls.
Reduces Pass 0b controls from ~85k to ~18-25k unique Master Controls via:
Phase 1: Intra-Group Dedup — same merge_group_hint → pick best, link rest
(85k → ~52k, mostly title-identical short-circuit, no embeddings)
Phase 2: Cross-Group Dedup — embed masters, search Qdrant for similar
masters with different hints (52k → ~18-25k)
All Pass 0b controls have pattern_id=NULL. The primary grouping key is
merge_group_hint (format: "action_type:norm_obj:trigger_key"), which
encodes the normalized action, object, and trigger.
Usage:
runner = BatchDedupRunner(db)
stats = await runner.run(dry_run=True) # preview
stats = await runner.run(dry_run=False) # execute
stats = await runner.run(hint_filter="implement:multi_factor_auth:none")
"""
import json
import logging
import time
from collections import defaultdict
from sqlalchemy import text
from compliance.services.control_dedup import (
canonicalize_text,
ensure_qdrant_collection,
get_embedding,
normalize_action,
normalize_object,
qdrant_search_cross_regulation,
qdrant_upsert,
LINK_THRESHOLD,
REVIEW_THRESHOLD,
)
logger = logging.getLogger(__name__)
DEDUP_COLLECTION = "atomic_controls_dedup"
# ── Quality Score ────────────────────────────────────────────────────────
def quality_score(control: dict) -> float:
"""Score a control by richness of requirements, tests, evidence, and objective.
Higher score = better candidate for master control.
"""
score = 0.0
reqs = control.get("requirements") or "[]"
if isinstance(reqs, str):
try:
reqs = json.loads(reqs)
except (json.JSONDecodeError, TypeError):
reqs = []
score += len(reqs) * 2.0
tests = control.get("test_procedure") or "[]"
if isinstance(tests, str):
try:
tests = json.loads(tests)
except (json.JSONDecodeError, TypeError):
tests = []
score += len(tests) * 1.5
evidence = control.get("evidence") or "[]"
if isinstance(evidence, str):
try:
evidence = json.loads(evidence)
except (json.JSONDecodeError, TypeError):
evidence = []
score += len(evidence) * 1.0
objective = control.get("objective") or ""
score += min(len(objective) / 200, 3.0)
return score
# ── Batch Dedup Runner ───────────────────────────────────────────────────
class BatchDedupRunner:
"""Batch dedup orchestrator for existing Pass 0b atomic controls."""
def __init__(self, db, collection: str = DEDUP_COLLECTION):
self.db = db
self.collection = collection
self.stats = {
"total_controls": 0,
"unique_hints": 0,
"phase1_groups_processed": 0,
"masters": 0,
"linked": 0,
"review": 0,
"new_controls": 0,
"parent_links_transferred": 0,
"cross_group_linked": 0,
"cross_group_review": 0,
"errors": 0,
"skipped_title_identical": 0,
}
self._progress_phase = ""
self._progress_count = 0
self._progress_total = 0
async def run(
self,
dry_run: bool = False,
hint_filter: str = None,
) -> dict:
"""Run the full batch dedup pipeline.
Args:
dry_run: If True, compute stats but don't modify DB/Qdrant.
hint_filter: If set, only process groups matching this hint prefix.
Returns:
Stats dict with counts.
"""
start = time.monotonic()
logger.info("BatchDedup starting (dry_run=%s, hint_filter=%s)",
dry_run, hint_filter)
if not dry_run:
await ensure_qdrant_collection(collection=self.collection)
# Phase 1: Intra-group dedup (same merge_group_hint)
self._progress_phase = "phase1"
groups = self._load_merge_groups(hint_filter)
self._progress_total = self.stats["total_controls"]
for hint, controls in groups:
try:
await self._process_hint_group(hint, controls, dry_run)
self.stats["phase1_groups_processed"] += 1
except Exception as e:
logger.error("BatchDedup Phase 1 error on hint %s: %s", hint, e)
self.stats["errors"] += 1
try:
self.db.rollback()
except Exception:
pass
logger.info(
"BatchDedup Phase 1 done: %d masters, %d linked, %d review",
self.stats["masters"], self.stats["linked"], self.stats["review"],
)
# Phase 2: Cross-group dedup via embeddings
if not dry_run:
self._progress_phase = "phase2"
await self._run_cross_group_pass()
elapsed = time.monotonic() - start
self.stats["elapsed_seconds"] = round(elapsed, 1)
logger.info("BatchDedup completed in %.1fs: %s", elapsed, self.stats)
return self.stats
def _load_merge_groups(self, hint_filter: str = None) -> list:
"""Load all Pass 0b controls grouped by merge_group_hint, largest first."""
conditions = [
"decomposition_method = 'pass0b'",
"release_state != 'deprecated'",
"release_state != 'duplicate'",
]
params = {}
if hint_filter:
conditions.append("generation_metadata->>'merge_group_hint' LIKE :hf")
params["hf"] = f"{hint_filter}%"
where = " AND ".join(conditions)
rows = self.db.execute(text(f"""
SELECT id::text, control_id, title, objective,
pattern_id, requirements::text, test_procedure::text,
evidence::text, release_state,
generation_metadata->>'merge_group_hint' as merge_group_hint,
generation_metadata->>'action_object_class' as action_object_class
FROM canonical_controls
WHERE {where}
ORDER BY control_id
"""), params).fetchall()
by_hint = defaultdict(list)
for r in rows:
by_hint[r[9] or ""].append({
"uuid": r[0],
"control_id": r[1],
"title": r[2],
"objective": r[3],
"pattern_id": r[4],
"requirements": r[5],
"test_procedure": r[6],
"evidence": r[7],
"release_state": r[8],
"merge_group_hint": r[9] or "",
"action_object_class": r[10] or "",
})
self.stats["total_controls"] = len(rows)
self.stats["unique_hints"] = len(by_hint)
sorted_groups = sorted(by_hint.items(), key=lambda x: len(x[1]), reverse=True)
logger.info("BatchDedup loaded %d controls in %d hint groups",
len(rows), len(sorted_groups))
return sorted_groups
def _sub_group_by_merge_hint(self, controls: list) -> dict:
"""Group controls by merge_group_hint composite key."""
groups = defaultdict(list)
for c in controls:
hint = c["merge_group_hint"]
if hint:
groups[hint].append(c)
else:
groups[f"__no_hint_{c['uuid']}"].append(c)
return dict(groups)
async def _process_hint_group(
self,
hint: str,
controls: list,
dry_run: bool,
):
"""Process all controls sharing the same merge_group_hint.
Within a hint group, all controls share action+object+trigger.
The best-quality control becomes master, rest are linked as duplicates.
"""
if len(controls) < 2:
# Singleton → always master
self.stats["masters"] += 1
if not dry_run:
await self._embed_and_index(controls[0])
self._progress_count += 1
self._log_progress(hint)
return
# Sort by quality score (best first)
sorted_group = sorted(controls, key=quality_score, reverse=True)
master = sorted_group[0]
self.stats["masters"] += 1
if not dry_run:
await self._embed_and_index(master)
for candidate in sorted_group[1:]:
# All share the same hint → check title similarity
if candidate["title"].strip().lower() == master["title"].strip().lower():
# Identical title → direct link (no embedding needed)
self.stats["linked"] += 1
self.stats["skipped_title_identical"] += 1
if not dry_run:
await self._mark_duplicate(master, candidate, confidence=1.0)
else:
# Different title within same hint → still likely duplicate
# Use embedding to verify
await self._check_and_link_within_group(master, candidate, dry_run)
self._progress_count += 1
self._log_progress(hint)
async def _check_and_link_within_group(
self,
master: dict,
candidate: dict,
dry_run: bool,
):
"""Check if candidate (same hint group) is duplicate of master via embedding."""
parts = candidate["merge_group_hint"].split(":", 2)
action = parts[0] if len(parts) > 0 else ""
obj = parts[1] if len(parts) > 1 else ""
canonical = canonicalize_text(action, obj, candidate["title"])
embedding = await get_embedding(canonical)
if not embedding:
# Can't embed → link anyway (same hint = same action+object)
self.stats["linked"] += 1
if not dry_run:
await self._mark_duplicate(master, candidate, confidence=0.90)
return
# Search the dedup collection (unfiltered — pattern_id is NULL)
results = await qdrant_search_cross_regulation(
embedding, top_k=3, collection=self.collection,
)
if not results:
# No Qdrant matches yet (master might not be indexed yet) → link to master
self.stats["linked"] += 1
if not dry_run:
await self._mark_duplicate(master, candidate, confidence=0.90)
return
best = results[0]
best_score = best.get("score", 0.0)
best_payload = best.get("payload", {})
best_uuid = best_payload.get("control_uuid", "")
if best_score > LINK_THRESHOLD:
self.stats["linked"] += 1
if not dry_run:
await self._mark_duplicate_to(best_uuid, candidate, confidence=best_score)
elif best_score > REVIEW_THRESHOLD:
self.stats["review"] += 1
if not dry_run:
self._write_review(candidate, best_payload, best_score)
else:
# Very different despite same hint → new master
self.stats["new_controls"] += 1
if not dry_run:
await self._index_with_embedding(candidate, embedding)
async def _run_cross_group_pass(self):
"""Phase 2: Find cross-group duplicates among surviving masters.
After Phase 1, ~52k masters remain. Many have similar semantics
despite different merge_group_hints (e.g. different German spellings).
This pass embeds all masters and finds near-duplicates via Qdrant.
"""
logger.info("BatchDedup Phase 2: Cross-group pass starting...")
rows = self.db.execute(text("""
SELECT id::text, control_id, title,
generation_metadata->>'merge_group_hint' as merge_group_hint
FROM canonical_controls
WHERE decomposition_method = 'pass0b'
AND release_state != 'duplicate'
AND release_state != 'deprecated'
ORDER BY control_id
""")).fetchall()
self._progress_total = len(rows)
self._progress_count = 0
logger.info("BatchDedup Cross-group: %d masters to check", len(rows))
cross_linked = 0
cross_review = 0
for i, r in enumerate(rows):
uuid = r[0]
hint = r[3] or ""
parts = hint.split(":", 2)
action = parts[0] if len(parts) > 0 else ""
obj = parts[1] if len(parts) > 1 else ""
canonical = canonicalize_text(action, obj, r[2])
embedding = await get_embedding(canonical)
if not embedding:
continue
results = await qdrant_search_cross_regulation(
embedding, top_k=5, collection=self.collection,
)
if not results:
continue
# Find best match from a DIFFERENT hint group
for match in results:
match_score = match.get("score", 0.0)
match_payload = match.get("payload", {})
match_uuid = match_payload.get("control_uuid", "")
# Skip self-match
if match_uuid == uuid:
continue
# Must be a different hint group (otherwise already handled in Phase 1)
match_action = match_payload.get("action_normalized", "")
match_object = match_payload.get("object_normalized", "")
# Simple check: different control UUID is enough
if match_score > LINK_THRESHOLD:
# Mark the worse one as duplicate
try:
self.db.execute(text("""
UPDATE canonical_controls
SET release_state = 'duplicate', merged_into_uuid = CAST(:master AS uuid)
WHERE id = CAST(:dup AS uuid)
AND release_state != 'duplicate'
"""), {"master": match_uuid, "dup": uuid})
self.db.execute(text("""
INSERT INTO control_parent_links
(control_uuid, parent_control_uuid, link_type, confidence)
VALUES (CAST(:cu AS uuid), CAST(:pu AS uuid), 'cross_regulation', :conf)
ON CONFLICT (control_uuid, parent_control_uuid) DO NOTHING
"""), {"cu": match_uuid, "pu": uuid, "conf": match_score})
# Transfer parent links
transferred = self._transfer_parent_links(match_uuid, uuid)
self.stats["parent_links_transferred"] += transferred
self.db.commit()
cross_linked += 1
except Exception as e:
logger.error("BatchDedup cross-group link error %s%s: %s",
uuid, match_uuid, e)
self.db.rollback()
self.stats["errors"] += 1
break # Only one cross-link per control
elif match_score > REVIEW_THRESHOLD:
self._write_review(
{"control_id": r[1], "title": r[2], "objective": "",
"merge_group_hint": hint, "pattern_id": None},
match_payload, match_score,
)
cross_review += 1
break
self._progress_count = i + 1
if (i + 1) % 500 == 0:
logger.info("BatchDedup Cross-group: %d/%d checked, %d linked, %d review",
i + 1, len(rows), cross_linked, cross_review)
self.stats["cross_group_linked"] = cross_linked
self.stats["cross_group_review"] = cross_review
logger.info("BatchDedup Cross-group complete: %d linked, %d review",
cross_linked, cross_review)
# ── Qdrant Helpers ───────────────────────────────────────────────────
async def _embed_and_index(self, control: dict):
"""Compute embedding and index a control in the dedup Qdrant collection."""
parts = control["merge_group_hint"].split(":", 2)
action = parts[0] if len(parts) > 0 else ""
obj = parts[1] if len(parts) > 1 else ""
norm_action = normalize_action(action)
norm_object = normalize_object(obj)
canonical = canonicalize_text(action, obj, control["title"])
embedding = await get_embedding(canonical)
if not embedding:
return
await qdrant_upsert(
point_id=control["uuid"],
embedding=embedding,
payload={
"control_uuid": control["uuid"],
"control_id": control["control_id"],
"title": control["title"],
"pattern_id": control.get("pattern_id"),
"action_normalized": norm_action,
"object_normalized": norm_object,
"canonical_text": canonical,
"merge_group_hint": control["merge_group_hint"],
},
collection=self.collection,
)
async def _index_with_embedding(self, control: dict, embedding: list):
"""Index a control with a pre-computed embedding."""
parts = control["merge_group_hint"].split(":", 2)
action = parts[0] if len(parts) > 0 else ""
obj = parts[1] if len(parts) > 1 else ""
norm_action = normalize_action(action)
norm_object = normalize_object(obj)
canonical = canonicalize_text(action, obj, control["title"])
await qdrant_upsert(
point_id=control["uuid"],
embedding=embedding,
payload={
"control_uuid": control["uuid"],
"control_id": control["control_id"],
"title": control["title"],
"pattern_id": control.get("pattern_id"),
"action_normalized": norm_action,
"object_normalized": norm_object,
"canonical_text": canonical,
"merge_group_hint": control["merge_group_hint"],
},
collection=self.collection,
)
# ── DB Write Helpers ─────────────────────────────────────────────────
async def _mark_duplicate(self, master: dict, candidate: dict, confidence: float):
"""Mark candidate as duplicate of master, transfer parent links."""
try:
self.db.execute(text("""
UPDATE canonical_controls
SET release_state = 'duplicate', merged_into_uuid = CAST(:master AS uuid)
WHERE id = CAST(:cand AS uuid)
"""), {"master": master["uuid"], "cand": candidate["uuid"]})
self.db.execute(text("""
INSERT INTO control_parent_links
(control_uuid, parent_control_uuid, link_type, confidence)
VALUES (CAST(:master AS uuid), CAST(:cand_parent AS uuid), 'dedup_merge', :conf)
ON CONFLICT (control_uuid, parent_control_uuid) DO NOTHING
"""), {"master": master["uuid"], "cand_parent": candidate["uuid"], "conf": confidence})
transferred = self._transfer_parent_links(master["uuid"], candidate["uuid"])
self.stats["parent_links_transferred"] += transferred
self.db.commit()
except Exception as e:
logger.error("BatchDedup _mark_duplicate error %s%s: %s",
candidate["uuid"], master["uuid"], e)
self.db.rollback()
raise
async def _mark_duplicate_to(self, master_uuid: str, candidate: dict, confidence: float):
"""Mark candidate as duplicate of a Qdrant-matched master."""
try:
self.db.execute(text("""
UPDATE canonical_controls
SET release_state = 'duplicate', merged_into_uuid = CAST(:master AS uuid)
WHERE id = CAST(:cand AS uuid)
"""), {"master": master_uuid, "cand": candidate["uuid"]})
self.db.execute(text("""
INSERT INTO control_parent_links
(control_uuid, parent_control_uuid, link_type, confidence)
VALUES (CAST(:master AS uuid), CAST(:cand_parent AS uuid), 'dedup_merge', :conf)
ON CONFLICT (control_uuid, parent_control_uuid) DO NOTHING
"""), {"master": master_uuid, "cand_parent": candidate["uuid"], "conf": confidence})
transferred = self._transfer_parent_links(master_uuid, candidate["uuid"])
self.stats["parent_links_transferred"] += transferred
self.db.commit()
except Exception as e:
logger.error("BatchDedup _mark_duplicate_to error %s%s: %s",
candidate["uuid"], master_uuid, e)
self.db.rollback()
raise
def _transfer_parent_links(self, master_uuid: str, duplicate_uuid: str) -> int:
"""Move existing parent links from duplicate to master."""
rows = self.db.execute(text("""
SELECT parent_control_uuid::text, link_type, confidence,
source_regulation, source_article, obligation_candidate_id::text
FROM control_parent_links
WHERE control_uuid = CAST(:dup AS uuid)
AND link_type = 'decomposition'
"""), {"dup": duplicate_uuid}).fetchall()
transferred = 0
for r in rows:
parent_uuid = r[0]
if parent_uuid == master_uuid:
continue
self.db.execute(text("""
INSERT INTO control_parent_links
(control_uuid, parent_control_uuid, link_type, confidence,
source_regulation, source_article, obligation_candidate_id)
VALUES (CAST(:cu AS uuid), CAST(:pu AS uuid), :lt, :conf,
:sr, :sa, CAST(:oci AS uuid))
ON CONFLICT (control_uuid, parent_control_uuid) DO NOTHING
"""), {
"cu": master_uuid,
"pu": parent_uuid,
"lt": r[1],
"conf": float(r[2]) if r[2] else 1.0,
"sr": r[3],
"sa": r[4],
"oci": r[5],
})
transferred += 1
return transferred
def _write_review(self, candidate: dict, matched_payload: dict, score: float):
"""Write a dedup review entry for borderline matches."""
try:
self.db.execute(text("""
INSERT INTO control_dedup_reviews
(candidate_control_id, candidate_title, candidate_objective,
matched_control_uuid, matched_control_id,
similarity_score, dedup_stage, dedup_details)
VALUES (:ccid, :ct, :co, CAST(:mcu AS uuid), :mci,
:ss, 'batch_dedup', CAST(:dd AS jsonb))
"""), {
"ccid": candidate["control_id"],
"ct": candidate["title"],
"co": candidate.get("objective", ""),
"mcu": matched_payload.get("control_uuid"),
"mci": matched_payload.get("control_id"),
"ss": score,
"dd": json.dumps({
"merge_group_hint": candidate.get("merge_group_hint", ""),
"pattern_id": candidate.get("pattern_id"),
}),
})
self.db.commit()
except Exception as e:
logger.error("BatchDedup _write_review error: %s", e)
self.db.rollback()
raise
# ── Progress ─────────────────────────────────────────────────────────
def _log_progress(self, hint: str):
"""Log progress every 500 controls."""
if self._progress_count > 0 and self._progress_count % 500 == 0:
logger.info(
"BatchDedup [%s] %d/%d — masters=%d, linked=%d, review=%d",
self._progress_phase, self._progress_count, self._progress_total,
self.stats["masters"], self.stats["linked"], self.stats["review"],
)
def get_status(self) -> dict:
"""Return current progress stats (for status endpoint)."""
return {
"phase": self._progress_phase,
"progress": self._progress_count,
"total": self._progress_total,
**self.stats,
}

View File

@@ -0,0 +1,438 @@
"""
Citation Backfill Service — enrich existing controls with article/paragraph provenance.
3-tier matching strategy:
Tier 1 — Hash match: sha256(source_original_text) → RAG chunk lookup
Tier 2 — Regex parse: split concatenated "DSGVO Art. 35" → regulation + article
Tier 3 — Ollama LLM: ask local LLM to identify article/paragraph from text
"""
import hashlib
import json
import logging
import os
import re
from dataclasses import dataclass, field
from datetime import datetime, timezone
from typing import Optional
import httpx
from sqlalchemy import text
from sqlalchemy.orm import Session
from .rag_client import ComplianceRAGClient, RAGSearchResult
logger = logging.getLogger(__name__)
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
OLLAMA_MODEL = os.getenv("CONTROL_GEN_OLLAMA_MODEL", "qwen3.5:35b-a3b")
LLM_TIMEOUT = float(os.getenv("CONTROL_GEN_LLM_TIMEOUT", "180"))
ALL_COLLECTIONS = [
"bp_compliance_ce",
"bp_compliance_gesetze",
"bp_compliance_datenschutz",
"bp_dsfa_corpus",
"bp_legal_templates",
]
BACKFILL_SYSTEM_PROMPT = (
"Du bist ein Rechtsexperte. Deine Aufgabe ist es, aus einem Gesetzestext "
"den genauen Artikel und Absatz zu bestimmen. Antworte NUR mit validem JSON."
)
# Regex to split concatenated source like "DSGVO Art. 35" or "NIS2 Artikel 21 Abs. 2"
_SOURCE_ARTICLE_RE = re.compile(
r"^(.+?)\s+(Art(?:ikel)?\.?\s*\d+.*)$", re.IGNORECASE
)
@dataclass
class MatchResult:
article: str
paragraph: str
method: str # "hash", "regex", "llm"
@dataclass
class BackfillResult:
total_controls: int = 0
matched_hash: int = 0
matched_regex: int = 0
matched_llm: int = 0
unmatched: int = 0
updated: int = 0
errors: list = field(default_factory=list)
class CitationBackfill:
"""Backfill article/paragraph into existing control source_citations."""
def __init__(self, db: Session, rag_client: ComplianceRAGClient):
self.db = db
self.rag = rag_client
self._rag_index: dict[str, RAGSearchResult] = {}
async def run(self, dry_run: bool = True, limit: int = 0) -> BackfillResult:
"""Main entry: iterate controls missing article/paragraph, match to RAG, update."""
result = BackfillResult()
# Load controls needing backfill
controls = self._load_controls_needing_backfill(limit)
result.total_controls = len(controls)
logger.info("Backfill: %d controls need article/paragraph enrichment", len(controls))
if not controls:
return result
# Collect hashes we need to find — only build index for controls with source text
needed_hashes: set[str] = set()
for ctrl in controls:
src = ctrl.get("source_original_text")
if src:
needed_hashes.add(hashlib.sha256(src.encode()).hexdigest())
if needed_hashes:
# Build targeted RAG index — only scroll collections that our controls reference
logger.info("Building targeted RAG hash index for %d source texts...", len(needed_hashes))
await self._build_rag_index_targeted(controls)
logger.info("RAG index built: %d chunks indexed, %d hashes needed", len(self._rag_index), len(needed_hashes))
else:
logger.info("No source_original_text found — skipping RAG index build")
# Process each control
for i, ctrl in enumerate(controls):
if i > 0 and i % 100 == 0:
logger.info("Backfill progress: %d/%d processed", i, result.total_controls)
try:
match = await self._match_control(ctrl)
if match:
if match.method == "hash":
result.matched_hash += 1
elif match.method == "regex":
result.matched_regex += 1
elif match.method == "llm":
result.matched_llm += 1
if not dry_run:
self._update_control(ctrl, match)
result.updated += 1
else:
logger.debug(
"DRY RUN: Would update %s with article=%s paragraph=%s (method=%s)",
ctrl["control_id"], match.article, match.paragraph, match.method,
)
else:
result.unmatched += 1
except Exception as e:
error_msg = f"Error backfilling {ctrl.get('control_id', '?')}: {e}"
logger.error(error_msg)
result.errors.append(error_msg)
if not dry_run:
try:
self.db.commit()
except Exception as e:
logger.error("Backfill commit failed: %s", e)
result.errors.append(f"Commit failed: {e}")
logger.info(
"Backfill complete: %d total, hash=%d regex=%d llm=%d unmatched=%d updated=%d",
result.total_controls, result.matched_hash, result.matched_regex,
result.matched_llm, result.unmatched, result.updated,
)
return result
def _load_controls_needing_backfill(self, limit: int = 0) -> list[dict]:
"""Load controls where source_citation exists but lacks separate 'article' key."""
query = """
SELECT id, control_id, source_citation, source_original_text,
generation_metadata, license_rule
FROM canonical_controls
WHERE license_rule IN (1, 2)
AND source_citation IS NOT NULL
AND (
source_citation->>'article' IS NULL
OR source_citation->>'article' = ''
)
ORDER BY control_id
"""
if limit > 0:
query += f" LIMIT {limit}"
result = self.db.execute(text(query))
cols = result.keys()
controls = []
for row in result:
ctrl = dict(zip(cols, row))
ctrl["id"] = str(ctrl["id"])
# Parse JSON fields
for jf in ("source_citation", "generation_metadata"):
if isinstance(ctrl.get(jf), str):
try:
ctrl[jf] = json.loads(ctrl[jf])
except (json.JSONDecodeError, TypeError):
ctrl[jf] = {}
controls.append(ctrl)
return controls
async def _build_rag_index_targeted(self, controls: list[dict]):
"""Build RAG index by scrolling only collections relevant to our controls.
Uses regulation codes from generation_metadata to identify which collections
to search, falling back to all collections only if needed.
"""
# Determine which collections are relevant based on regulation codes
regulation_to_collection = self._map_regulations_to_collections(controls)
collections_to_search = set(regulation_to_collection.values()) or set(ALL_COLLECTIONS)
logger.info("Targeted index: searching %d collections: %s",
len(collections_to_search), ", ".join(collections_to_search))
for collection in collections_to_search:
offset = None
page = 0
seen_offsets: set[str] = set()
while True:
chunks, next_offset = await self.rag.scroll(
collection=collection, offset=offset, limit=200,
)
if not chunks:
break
for chunk in chunks:
if chunk.text and len(chunk.text.strip()) >= 50:
h = hashlib.sha256(chunk.text.encode()).hexdigest()
self._rag_index[h] = chunk
page += 1
if page % 50 == 0:
logger.info("Indexing %s: page %d (%d chunks so far)",
collection, page, len(self._rag_index))
if not next_offset:
break
if next_offset in seen_offsets:
logger.warning("Scroll loop in %s at page %d — stopping", collection, page)
break
seen_offsets.add(next_offset)
offset = next_offset
logger.info("Indexed collection %s: %d pages", collection, page)
def _map_regulations_to_collections(self, controls: list[dict]) -> dict[str, str]:
"""Map regulation codes from controls to likely Qdrant collections."""
# Heuristic: regulation code prefix → collection
collection_map = {
"eu_": "bp_compliance_gesetze",
"dsgvo": "bp_compliance_datenschutz",
"bdsg": "bp_compliance_gesetze",
"ttdsg": "bp_compliance_gesetze",
"nist_": "bp_compliance_ce",
"owasp": "bp_compliance_ce",
"bsi_": "bp_compliance_ce",
"enisa": "bp_compliance_ce",
"at_": "bp_compliance_recht",
"fr_": "bp_compliance_recht",
"es_": "bp_compliance_recht",
}
result: dict[str, str] = {}
for ctrl in controls:
meta = ctrl.get("generation_metadata") or {}
reg = meta.get("source_regulation", "")
if not reg:
continue
for prefix, coll in collection_map.items():
if reg.startswith(prefix):
result[reg] = coll
break
else:
# Unknown regulation — search all
for coll in ALL_COLLECTIONS:
result[f"_all_{coll}"] = coll
return result
async def _match_control(self, ctrl: dict) -> Optional[MatchResult]:
"""3-tier matching: hash → regex → LLM."""
# Tier 1: Hash match against RAG index
source_text = ctrl.get("source_original_text")
if source_text:
h = hashlib.sha256(source_text.encode()).hexdigest()
chunk = self._rag_index.get(h)
if chunk and (chunk.article or chunk.paragraph):
return MatchResult(
article=chunk.article or "",
paragraph=chunk.paragraph or "",
method="hash",
)
# Tier 2: Regex parse concatenated source
citation = ctrl.get("source_citation") or {}
source_str = citation.get("source", "")
parsed = _parse_concatenated_source(source_str)
if parsed and parsed["article"]:
return MatchResult(
article=parsed["article"],
paragraph="", # Regex can't extract paragraph from concatenated format
method="regex",
)
# Tier 3: Ollama LLM
if source_text:
return await self._llm_match(ctrl)
return None
async def _llm_match(self, ctrl: dict) -> Optional[MatchResult]:
"""Use Ollama to identify article/paragraph from source text."""
citation = ctrl.get("source_citation") or {}
regulation_name = citation.get("source", "")
metadata = ctrl.get("generation_metadata") or {}
regulation_code = metadata.get("source_regulation", "")
source_text = ctrl.get("source_original_text", "")
prompt = f"""Analysiere den folgenden Gesetzestext und bestimme den genauen Artikel und Absatz.
Gesetz: {regulation_name} (Code: {regulation_code})
Text:
---
{source_text[:2000]}
---
Antworte NUR mit JSON:
{{"article": "Art. XX", "paragraph": "Abs. Y"}}
Falls kein spezifischer Absatz erkennbar ist, setze paragraph auf "".
Falls kein Artikel erkennbar ist, setze article auf "".
Bei deutschen Gesetzen mit § verwende: "§ XX" statt "Art. XX"."""
try:
raw = await _llm_ollama(prompt, BACKFILL_SYSTEM_PROMPT)
data = _parse_json(raw)
if data and (data.get("article") or data.get("paragraph")):
return MatchResult(
article=data.get("article", ""),
paragraph=data.get("paragraph", ""),
method="llm",
)
except Exception as e:
logger.warning("LLM match failed for %s: %s", ctrl.get("control_id"), e)
return None
def _update_control(self, ctrl: dict, match: MatchResult):
"""Update source_citation and generation_metadata in DB."""
citation = ctrl.get("source_citation") or {}
# Clean the source name: remove concatenated article if present
source_str = citation.get("source", "")
parsed = _parse_concatenated_source(source_str)
if parsed:
citation["source"] = parsed["name"]
# Add separate article/paragraph fields
citation["article"] = match.article
citation["paragraph"] = match.paragraph
# Update generation_metadata
metadata = ctrl.get("generation_metadata") or {}
if match.article:
metadata["source_article"] = match.article
metadata["source_paragraph"] = match.paragraph
metadata["backfill_method"] = match.method
metadata["backfill_at"] = datetime.now(timezone.utc).isoformat()
self.db.execute(
text("""
UPDATE canonical_controls
SET source_citation = :citation,
generation_metadata = :metadata,
updated_at = NOW()
WHERE id = CAST(:id AS uuid)
"""),
{
"id": ctrl["id"],
"citation": json.dumps(citation),
"metadata": json.dumps(metadata),
},
)
def _parse_concatenated_source(source: str) -> Optional[dict]:
"""Parse 'DSGVO Art. 35'{name: 'DSGVO', article: 'Art. 35'}.
Also handles '§' format: 'BDSG § 42'{name: 'BDSG', article: '§ 42'}.
"""
if not source:
return None
# Try Art./Artikel pattern
m = _SOURCE_ARTICLE_RE.match(source)
if m:
return {"name": m.group(1).strip(), "article": m.group(2).strip()}
# Try § pattern
m2 = re.match(r"^(.+?)\s+(§\s*\d+.*)$", source)
if m2:
return {"name": m2.group(1).strip(), "article": m2.group(2).strip()}
return None
async def _llm_ollama(prompt: str, system_prompt: Optional[str] = None) -> str:
"""Call Ollama chat API for backfill matching."""
messages = []
if system_prompt:
messages.append({"role": "system", "content": system_prompt})
messages.append({"role": "user", "content": prompt})
payload = {
"model": OLLAMA_MODEL,
"messages": messages,
"stream": False,
"format": "json",
"options": {"num_predict": 256},
"think": False,
}
try:
async with httpx.AsyncClient(timeout=LLM_TIMEOUT) as client:
resp = await client.post(f"{OLLAMA_URL}/api/chat", json=payload)
if resp.status_code != 200:
logger.error("Ollama backfill failed %d: %s", resp.status_code, resp.text[:300])
return ""
data = resp.json()
msg = data.get("message", {})
if isinstance(msg, dict):
return msg.get("content", "")
return data.get("response", str(msg))
except Exception as e:
logger.error("Ollama backfill request failed: %s", e)
return ""
def _parse_json(raw: str) -> Optional[dict]:
"""Extract JSON object from LLM output."""
if not raw:
return None
# Try direct parse
try:
return json.loads(raw)
except json.JSONDecodeError:
pass
# Try extracting from markdown code block
m = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", raw, re.DOTALL)
if m:
try:
return json.loads(m.group(1))
except json.JSONDecodeError:
pass
# Try finding first { ... }
m = re.search(r"\{[^{}]*\}", raw)
if m:
try:
return json.loads(m.group(0))
except json.JSONDecodeError:
pass
return None

View File

@@ -0,0 +1,546 @@
"""Control Composer — Pattern + Obligation → Master Control.
Takes an obligation (from ObligationExtractor) and a matched control pattern
(from PatternMatcher), then uses LLM to compose a structured, actionable
Master Control. Replaces the old Stage 3 (STRUCTURE/REFORM) with a
pattern-guided approach.
Three composition modes based on license rules:
Rule 1: Obligation + Pattern + original text → full control
Rule 2: Obligation + Pattern + original text + citation → control
Rule 3: Obligation + Pattern (NO original text) → reformulated control
Fallback: No pattern match → basic generation (tagged needs_pattern_assignment)
Part of the Multi-Layer Control Architecture (Phase 6 of 8).
"""
import json
import logging
import os
from dataclasses import dataclass, field
from typing import Optional
from compliance.services.obligation_extractor import (
ObligationMatch,
_llm_ollama,
_parse_json,
)
from compliance.services.pattern_matcher import (
ControlPattern,
PatternMatchResult,
)
logger = logging.getLogger(__name__)
OLLAMA_MODEL = os.getenv("CONTROL_GEN_OLLAMA_MODEL", "qwen3.5:35b-a3b")
# Valid values for generated control fields
VALID_SEVERITIES = {"low", "medium", "high", "critical"}
VALID_EFFORTS = {"s", "m", "l", "xl"}
VALID_VERIFICATION = {"code_review", "document", "tool", "hybrid"}
@dataclass
class ComposedControl:
"""A Master Control composed from an obligation + pattern."""
# Core fields (match canonical_controls schema)
control_id: str = ""
title: str = ""
objective: str = ""
rationale: str = ""
scope: dict = field(default_factory=dict)
requirements: list = field(default_factory=list)
test_procedure: list = field(default_factory=list)
evidence: list = field(default_factory=list)
severity: str = "medium"
risk_score: float = 5.0
implementation_effort: str = "m"
open_anchors: list = field(default_factory=list)
release_state: str = "draft"
tags: list = field(default_factory=list)
# 3-Rule License fields
license_rule: Optional[int] = None
source_original_text: Optional[str] = None
source_citation: Optional[dict] = None
customer_visible: bool = True
# Classification
verification_method: Optional[str] = None
category: Optional[str] = None
target_audience: Optional[list] = None
# Pattern + Obligation linkage
pattern_id: Optional[str] = None
obligation_ids: list = field(default_factory=list)
# Metadata
generation_metadata: dict = field(default_factory=dict)
composition_method: str = "pattern_guided" # pattern_guided | fallback
def to_dict(self) -> dict:
"""Serialize for DB storage or API response."""
return {
"control_id": self.control_id,
"title": self.title,
"objective": self.objective,
"rationale": self.rationale,
"scope": self.scope,
"requirements": self.requirements,
"test_procedure": self.test_procedure,
"evidence": self.evidence,
"severity": self.severity,
"risk_score": self.risk_score,
"implementation_effort": self.implementation_effort,
"open_anchors": self.open_anchors,
"release_state": self.release_state,
"tags": self.tags,
"license_rule": self.license_rule,
"source_original_text": self.source_original_text,
"source_citation": self.source_citation,
"customer_visible": self.customer_visible,
"verification_method": self.verification_method,
"category": self.category,
"target_audience": self.target_audience,
"pattern_id": self.pattern_id,
"obligation_ids": self.obligation_ids,
"generation_metadata": self.generation_metadata,
"composition_method": self.composition_method,
}
class ControlComposer:
"""Composes Master Controls from obligations + patterns.
Usage::
composer = ControlComposer()
control = await composer.compose(
obligation=obligation_match,
pattern_result=pattern_match_result,
chunk_text="...",
license_rule=1,
source_citation={...},
)
"""
async def compose(
self,
obligation: ObligationMatch,
pattern_result: PatternMatchResult,
chunk_text: Optional[str] = None,
license_rule: int = 3,
source_citation: Optional[dict] = None,
regulation_code: Optional[str] = None,
) -> ComposedControl:
"""Compose a Master Control from obligation + pattern.
Args:
obligation: The extracted obligation (from ObligationExtractor).
pattern_result: The matched pattern (from PatternMatcher).
chunk_text: Original RAG chunk text (only used for Rules 1-2).
license_rule: 1=free, 2=citation, 3=restricted.
source_citation: Citation metadata for Rule 2.
regulation_code: Source regulation code.
Returns:
ComposedControl ready for storage.
"""
pattern = pattern_result.pattern if pattern_result else None
if pattern:
control = await self._compose_with_pattern(
obligation, pattern, chunk_text, license_rule, source_citation,
)
else:
control = await self._compose_fallback(
obligation, chunk_text, license_rule, source_citation,
)
# Set linkage fields
control.pattern_id = pattern.id if pattern else None
if obligation.obligation_id:
control.obligation_ids = [obligation.obligation_id]
# Set license fields
control.license_rule = license_rule
if license_rule in (1, 2) and chunk_text:
control.source_original_text = chunk_text
if license_rule == 2 and source_citation:
control.source_citation = source_citation
if license_rule == 3:
control.customer_visible = False
control.source_original_text = None
control.source_citation = None
# Build metadata
control.generation_metadata = {
"composition_method": control.composition_method,
"pattern_id": control.pattern_id,
"pattern_confidence": round(pattern_result.confidence, 3) if pattern_result else 0,
"pattern_method": pattern_result.method if pattern_result else "none",
"obligation_id": obligation.obligation_id,
"obligation_method": obligation.method,
"obligation_confidence": round(obligation.confidence, 3),
"license_rule": license_rule,
"regulation_code": regulation_code,
}
# Validate and fix fields
_validate_control(control)
return control
async def compose_batch(
self,
items: list[dict],
) -> list[ComposedControl]:
"""Compose multiple controls.
Args:
items: List of dicts with keys: obligation, pattern_result,
chunk_text, license_rule, source_citation, regulation_code.
Returns:
List of ComposedControl instances.
"""
results = []
for item in items:
control = await self.compose(
obligation=item["obligation"],
pattern_result=item.get("pattern_result", PatternMatchResult()),
chunk_text=item.get("chunk_text"),
license_rule=item.get("license_rule", 3),
source_citation=item.get("source_citation"),
regulation_code=item.get("regulation_code"),
)
results.append(control)
return results
# -----------------------------------------------------------------------
# Pattern-guided composition
# -----------------------------------------------------------------------
async def _compose_with_pattern(
self,
obligation: ObligationMatch,
pattern: ControlPattern,
chunk_text: Optional[str],
license_rule: int,
source_citation: Optional[dict],
) -> ComposedControl:
"""Use LLM to fill the pattern template with obligation-specific details."""
prompt = _build_compose_prompt(obligation, pattern, chunk_text, license_rule)
system_prompt = _compose_system_prompt(license_rule)
llm_result = await _llm_ollama(prompt, system_prompt)
if not llm_result:
return self._compose_from_template(obligation, pattern)
parsed = _parse_json(llm_result)
if not parsed:
return self._compose_from_template(obligation, pattern)
control = ComposedControl(
title=parsed.get("title", pattern.name_de)[:255],
objective=parsed.get("objective", pattern.objective_template),
rationale=parsed.get("rationale", pattern.rationale_template),
requirements=_ensure_list(parsed.get("requirements", pattern.requirements_template)),
test_procedure=_ensure_list(parsed.get("test_procedure", pattern.test_procedure_template)),
evidence=_ensure_list(parsed.get("evidence", pattern.evidence_template)),
severity=parsed.get("severity", pattern.severity_default),
implementation_effort=parsed.get("implementation_effort", pattern.implementation_effort_default),
category=parsed.get("category", pattern.category),
tags=_ensure_list(parsed.get("tags", pattern.tags)),
target_audience=_ensure_list(parsed.get("target_audience", [])),
verification_method=parsed.get("verification_method"),
open_anchors=_anchors_from_pattern(pattern),
composition_method="pattern_guided",
)
return control
def _compose_from_template(
self,
obligation: ObligationMatch,
pattern: ControlPattern,
) -> ComposedControl:
"""Fallback: fill template directly without LLM (when LLM fails)."""
obl_title = obligation.obligation_title or ""
obl_text = obligation.obligation_text or ""
title = f"{pattern.name_de}"
if obl_title:
title = f"{pattern.name_de}{obl_title}"
objective = pattern.objective_template
if obl_text and len(obl_text) > 20:
objective = f"{pattern.objective_template} Bezug: {obl_text[:200]}"
return ComposedControl(
title=title[:255],
objective=objective,
rationale=pattern.rationale_template,
requirements=list(pattern.requirements_template),
test_procedure=list(pattern.test_procedure_template),
evidence=list(pattern.evidence_template),
severity=pattern.severity_default,
implementation_effort=pattern.implementation_effort_default,
category=pattern.category,
tags=list(pattern.tags),
open_anchors=_anchors_from_pattern(pattern),
composition_method="template_only",
)
# -----------------------------------------------------------------------
# Fallback (no pattern)
# -----------------------------------------------------------------------
async def _compose_fallback(
self,
obligation: ObligationMatch,
chunk_text: Optional[str],
license_rule: int,
source_citation: Optional[dict],
) -> ComposedControl:
"""Generate a control without a pattern template (old-style)."""
prompt = _build_fallback_prompt(obligation, chunk_text, license_rule)
system_prompt = _compose_system_prompt(license_rule)
llm_result = await _llm_ollama(prompt, system_prompt)
parsed = _parse_json(llm_result) if llm_result else {}
obl_text = obligation.obligation_text or ""
control = ComposedControl(
title=parsed.get("title", obl_text[:100] if obl_text else "Untitled Control")[:255],
objective=parsed.get("objective", obl_text[:500]),
rationale=parsed.get("rationale", "Aus gesetzlicher Pflicht abgeleitet."),
requirements=_ensure_list(parsed.get("requirements", [])),
test_procedure=_ensure_list(parsed.get("test_procedure", [])),
evidence=_ensure_list(parsed.get("evidence", [])),
severity=parsed.get("severity", "medium"),
implementation_effort=parsed.get("implementation_effort", "m"),
category=parsed.get("category"),
tags=_ensure_list(parsed.get("tags", [])),
target_audience=_ensure_list(parsed.get("target_audience", [])),
verification_method=parsed.get("verification_method"),
composition_method="fallback",
release_state="needs_review",
)
return control
# ---------------------------------------------------------------------------
# Prompt builders
# ---------------------------------------------------------------------------
def _compose_system_prompt(license_rule: int) -> str:
"""Build the system prompt based on license rule."""
if license_rule == 3:
return (
"Du bist ein Security-Compliance-Experte. Deine Aufgabe ist es, "
"eigenstaendige Security Controls zu formulieren. "
"Du formulierst IMMER in eigenen Worten. "
"KOPIERE KEINE Saetze aus dem Quelltext. "
"Verwende eigene Begriffe und Struktur. "
"NENNE NICHT die Quelle. Keine proprietaeren Bezeichner. "
"Antworte NUR mit validem JSON."
)
return (
"Du bist ein Security-Compliance-Experte. "
"Erstelle ein praxisorientiertes, umsetzbares Security Control. "
"Antworte NUR mit validem JSON."
)
def _build_compose_prompt(
obligation: ObligationMatch,
pattern: ControlPattern,
chunk_text: Optional[str],
license_rule: int,
) -> str:
"""Build the LLM prompt for pattern-guided composition."""
obl_section = _obligation_section(obligation)
pattern_section = _pattern_section(pattern)
if license_rule == 3:
context_section = "KONTEXT: Intern analysiert (keine Quellenangabe)."
elif chunk_text:
context_section = f"KONTEXT (Originaltext):\n{chunk_text[:2000]}"
else:
context_section = "KONTEXT: Kein Originaltext verfuegbar."
return f"""Erstelle ein PRAXISORIENTIERTES Security Control.
{obl_section}
{pattern_section}
{context_section}
AUFGABE:
Fuelle das Muster mit pflicht-spezifischen Details.
Das Ergebnis muss UMSETZBAR sein — keine Gesetzesparaphrase.
Formuliere konkret und handlungsorientiert.
Antworte als JSON:
{{
"title": "Kurzer praegnanter Titel (max 100 Zeichen, deutsch)",
"objective": "Was soll erreicht werden? (1-3 Saetze)",
"rationale": "Warum ist das wichtig? (1-2 Saetze)",
"requirements": ["Konkrete Anforderung 1", "Anforderung 2", ...],
"test_procedure": ["Pruefschritt 1", "Pruefschritt 2", ...],
"evidence": ["Nachweis 1", "Nachweis 2", ...],
"severity": "low|medium|high|critical",
"implementation_effort": "s|m|l|xl",
"category": "{pattern.category}",
"tags": ["tag1", "tag2"],
"target_audience": ["unternehmen", "behoerden", "entwickler"],
"verification_method": "code_review|document|tool|hybrid"
}}"""
def _build_fallback_prompt(
obligation: ObligationMatch,
chunk_text: Optional[str],
license_rule: int,
) -> str:
"""Build the LLM prompt for fallback composition (no pattern)."""
obl_section = _obligation_section(obligation)
if license_rule == 3:
context_section = "KONTEXT: Intern analysiert (keine Quellenangabe)."
elif chunk_text:
context_section = f"KONTEXT (Originaltext):\n{chunk_text[:2000]}"
else:
context_section = "KONTEXT: Kein Originaltext verfuegbar."
return f"""Erstelle ein Security Control aus der folgenden Pflicht.
{obl_section}
{context_section}
AUFGABE:
Formuliere ein umsetzbares Security Control.
Keine Gesetzesparaphrase — konkrete Massnahmen beschreiben.
Antworte als JSON:
{{
"title": "Kurzer praegnanter Titel (max 100 Zeichen, deutsch)",
"objective": "Was soll erreicht werden? (1-3 Saetze)",
"rationale": "Warum ist das wichtig? (1-2 Saetze)",
"requirements": ["Konkrete Anforderung 1", "Anforderung 2", ...],
"test_procedure": ["Pruefschritt 1", "Pruefschritt 2", ...],
"evidence": ["Nachweis 1", "Nachweis 2", ...],
"severity": "low|medium|high|critical",
"implementation_effort": "s|m|l|xl",
"category": "one of: authentication, encryption, data_protection, etc.",
"tags": ["tag1", "tag2"],
"target_audience": ["unternehmen"],
"verification_method": "code_review|document|tool|hybrid"
}}"""
def _obligation_section(obligation: ObligationMatch) -> str:
"""Format the obligation for the prompt."""
parts = ["PFLICHT (was das Gesetz verlangt):"]
if obligation.obligation_title:
parts.append(f" Titel: {obligation.obligation_title}")
if obligation.obligation_text:
parts.append(f" Beschreibung: {obligation.obligation_text[:500]}")
if obligation.obligation_id:
parts.append(f" ID: {obligation.obligation_id}")
if obligation.regulation_id:
parts.append(f" Rechtsgrundlage: {obligation.regulation_id}")
if not obligation.obligation_text and not obligation.obligation_title:
parts.append(" (Keine spezifische Pflicht extrahiert)")
return "\n".join(parts)
def _pattern_section(pattern: ControlPattern) -> str:
"""Format the pattern for the prompt."""
reqs = "\n ".join(f"- {r}" for r in pattern.requirements_template[:5])
tests = "\n ".join(f"- {t}" for t in pattern.test_procedure_template[:3])
return f"""MUSTER (wie man es typischerweise umsetzt):
Pattern: {pattern.name_de} ({pattern.id})
Domain: {pattern.domain}
Ziel-Template: {pattern.objective_template}
Anforderungs-Template:
{reqs}
Pruefverfahren-Template:
{tests}"""
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _ensure_list(value) -> list:
"""Ensure a value is a list of strings."""
if isinstance(value, list):
return [str(v) for v in value if v]
if isinstance(value, str):
return [value]
return []
def _anchors_from_pattern(pattern: ControlPattern) -> list:
"""Convert pattern's open_anchor_refs to control anchor format."""
anchors = []
for ref in pattern.open_anchor_refs:
anchors.append({
"framework": ref.get("framework", ""),
"control_id": ref.get("ref", ""),
"title": "",
"alignment_score": 0.8,
})
return anchors
def _validate_control(control: ComposedControl) -> None:
"""Validate and fix control field values."""
# Severity
if control.severity not in VALID_SEVERITIES:
control.severity = "medium"
# Implementation effort
if control.implementation_effort not in VALID_EFFORTS:
control.implementation_effort = "m"
# Verification method
if control.verification_method and control.verification_method not in VALID_VERIFICATION:
control.verification_method = None
# Risk score
if not (0 <= control.risk_score <= 10):
control.risk_score = _severity_to_risk(control.severity)
# Title length
if len(control.title) > 255:
control.title = control.title[:252] + "..."
# Ensure minimum content
if not control.objective:
control.objective = control.title
if not control.rationale:
control.rationale = "Aus regulatorischer Anforderung abgeleitet."
if not control.requirements:
control.requirements = ["Anforderung gemaess Pflichtbeschreibung umsetzen"]
if not control.test_procedure:
control.test_procedure = ["Umsetzung der Anforderungen pruefen"]
if not control.evidence:
control.evidence = ["Dokumentation der Umsetzung"]
def _severity_to_risk(severity: str) -> float:
"""Map severity to a default risk score."""
return {
"critical": 9.0,
"high": 7.0,
"medium": 5.0,
"low": 3.0,
}.get(severity, 5.0)

View File

@@ -0,0 +1,745 @@
"""Control Deduplication Engine — 4-Stage Matching Pipeline.
Prevents duplicate atomic controls during Pass 0b by checking candidates
against existing controls before insertion.
Stages:
1. Pattern-Gate: pattern_id must match (hard gate)
2. Action-Check: normalized action verb must match (hard gate)
3. Object-Norm: normalized object must match (soft gate with high threshold)
4. Embedding: cosine similarity with tiered thresholds (Qdrant)
Verdicts:
- NEW: create a new atomic control
- LINK: add parent link to existing control (similarity > LINK_THRESHOLD)
- REVIEW: queue for human review (REVIEW_THRESHOLD < sim < LINK_THRESHOLD)
"""
import logging
import os
import re
from dataclasses import dataclass, field
from typing import Optional, Callable, Awaitable
import httpx
logger = logging.getLogger(__name__)
# ── Configuration ────────────────────────────────────────────────────
DEDUP_ENABLED = os.getenv("DEDUP_ENABLED", "true").lower() == "true"
LINK_THRESHOLD = float(os.getenv("DEDUP_LINK_THRESHOLD", "0.92"))
REVIEW_THRESHOLD = float(os.getenv("DEDUP_REVIEW_THRESHOLD", "0.85"))
LINK_THRESHOLD_DIFF_OBJECT = float(os.getenv("DEDUP_LINK_THRESHOLD_DIFF_OBJ", "0.95"))
CROSS_REG_LINK_THRESHOLD = float(os.getenv("DEDUP_CROSS_REG_THRESHOLD", "0.95"))
QDRANT_COLLECTION = os.getenv("DEDUP_QDRANT_COLLECTION", "atomic_controls")
QDRANT_URL = os.getenv("QDRANT_URL", "http://host.docker.internal:6333")
EMBEDDING_URL = os.getenv("EMBEDDING_URL", "http://embedding-service:8087")
# ── Result Dataclass ─────────────────────────────────────────────────
@dataclass
class DedupResult:
"""Outcome of the dedup check."""
verdict: str # "new" | "link" | "review"
matched_control_uuid: Optional[str] = None
matched_control_id: Optional[str] = None
matched_title: Optional[str] = None
stage: str = "" # which stage decided
similarity_score: float = 0.0
link_type: str = "dedup_merge" # "dedup_merge" | "cross_regulation"
details: dict = field(default_factory=dict)
# ── Action Normalization ─────────────────────────────────────────────
_ACTION_SYNONYMS: dict[str, str] = {
# German → canonical English
"implementieren": "implement",
"umsetzen": "implement",
"einrichten": "implement",
"einführen": "implement",
"aufbauen": "implement",
"bereitstellen": "implement",
"aktivieren": "implement",
"konfigurieren": "configure",
"einstellen": "configure",
"parametrieren": "configure",
"testen": "test",
"prüfen": "test",
"überprüfen": "test",
"verifizieren": "test",
"validieren": "test",
"kontrollieren": "test",
"auditieren": "audit",
"dokumentieren": "document",
"protokollieren": "log",
"aufzeichnen": "log",
"loggen": "log",
"überwachen": "monitor",
"monitoring": "monitor",
"beobachten": "monitor",
"schulen": "train",
"trainieren": "train",
"sensibilisieren": "train",
"löschen": "delete",
"entfernen": "delete",
"verschlüsseln": "encrypt",
"sperren": "block",
"beschränken": "restrict",
"einschränken": "restrict",
"begrenzen": "restrict",
"autorisieren": "authorize",
"genehmigen": "authorize",
"freigeben": "authorize",
"authentifizieren": "authenticate",
"identifizieren": "identify",
"melden": "report",
"benachrichtigen": "notify",
"informieren": "notify",
"aktualisieren": "update",
"erneuern": "update",
"sichern": "backup",
"wiederherstellen": "restore",
# English passthrough
"implement": "implement",
"configure": "configure",
"test": "test",
"verify": "test",
"validate": "test",
"audit": "audit",
"document": "document",
"log": "log",
"monitor": "monitor",
"train": "train",
"delete": "delete",
"encrypt": "encrypt",
"restrict": "restrict",
"authorize": "authorize",
"authenticate": "authenticate",
"report": "report",
"update": "update",
"backup": "backup",
"restore": "restore",
}
def normalize_action(action: str) -> str:
"""Normalize an action verb to a canonical English form."""
if not action:
return ""
action = action.strip().lower()
# Strip German infinitive/conjugation suffixes for lookup
action_base = re.sub(r"(en|t|st|e|te|tet|end)$", "", action)
# Try exact match first, then base form
if action in _ACTION_SYNONYMS:
return _ACTION_SYNONYMS[action]
if action_base in _ACTION_SYNONYMS:
return _ACTION_SYNONYMS[action_base]
# Fuzzy: check if action starts with any known verb
for verb, canonical in _ACTION_SYNONYMS.items():
if action.startswith(verb) or verb.startswith(action):
return canonical
return action # fallback: return as-is
# ── Object Normalization ─────────────────────────────────────────────
_OBJECT_SYNONYMS: dict[str, str] = {
# Authentication / Access
"mfa": "multi_factor_auth",
"multi-faktor-authentifizierung": "multi_factor_auth",
"mehrfaktorauthentifizierung": "multi_factor_auth",
"multi-factor authentication": "multi_factor_auth",
"two-factor": "multi_factor_auth",
"2fa": "multi_factor_auth",
"passwort": "password_policy",
"kennwort": "password_policy",
"password": "password_policy",
"zugangsdaten": "credentials",
"credentials": "credentials",
"admin-konten": "privileged_access",
"admin accounts": "privileged_access",
"administratorkonten": "privileged_access",
"privilegierte zugriffe": "privileged_access",
"privileged accounts": "privileged_access",
"remote-zugriff": "remote_access",
"fernzugriff": "remote_access",
"remote access": "remote_access",
"session": "session_management",
"sitzung": "session_management",
"sitzungsverwaltung": "session_management",
# Encryption
"verschlüsselung": "encryption",
"encryption": "encryption",
"kryptografie": "encryption",
"kryptografische verfahren": "encryption",
"schlüssel": "key_management",
"key management": "key_management",
"schlüsselverwaltung": "key_management",
"zertifikat": "certificate_management",
"certificate": "certificate_management",
"tls": "transport_encryption",
"ssl": "transport_encryption",
"https": "transport_encryption",
# Network
"firewall": "firewall",
"netzwerk": "network_security",
"network": "network_security",
"vpn": "vpn",
"segmentierung": "network_segmentation",
"segmentation": "network_segmentation",
# Logging / Monitoring
"audit-log": "audit_logging",
"audit log": "audit_logging",
"protokoll": "audit_logging",
"logging": "audit_logging",
"monitoring": "monitoring",
"überwachung": "monitoring",
"alerting": "alerting",
"alarmierung": "alerting",
"siem": "siem",
# Data
"personenbezogene daten": "personal_data",
"personal data": "personal_data",
"sensible daten": "sensitive_data",
"sensitive data": "sensitive_data",
"datensicherung": "backup",
"backup": "backup",
"wiederherstellung": "disaster_recovery",
"disaster recovery": "disaster_recovery",
# Policy / Process
"richtlinie": "policy",
"policy": "policy",
"verfahrensanweisung": "procedure",
"procedure": "procedure",
"prozess": "process",
"schulung": "training",
"training": "training",
"awareness": "awareness",
"sensibilisierung": "awareness",
# Incident
"vorfall": "incident",
"incident": "incident",
"sicherheitsvorfall": "security_incident",
"security incident": "security_incident",
# Vulnerability
"schwachstelle": "vulnerability",
"vulnerability": "vulnerability",
"patch": "patch_management",
"update": "patch_management",
"patching": "patch_management",
}
# Precompile for substring matching (longest first)
_OBJECT_KEYS_SORTED = sorted(_OBJECT_SYNONYMS.keys(), key=len, reverse=True)
def normalize_object(obj: str) -> str:
"""Normalize a compliance object to a canonical token."""
if not obj:
return ""
obj_lower = obj.strip().lower()
# Exact match
if obj_lower in _OBJECT_SYNONYMS:
return _OBJECT_SYNONYMS[obj_lower]
# Substring match (longest first)
for phrase in _OBJECT_KEYS_SORTED:
if phrase in obj_lower:
return _OBJECT_SYNONYMS[phrase]
# Fallback: strip articles/prepositions, join with underscore
cleaned = re.sub(r"\b(der|die|das|den|dem|des|ein|eine|eines|einem|einen"
r"|für|von|zu|auf|in|an|bei|mit|nach|über|unter|the|a|an"
r"|for|of|to|on|in|at|by|with)\b", "", obj_lower)
tokens = [t for t in cleaned.split() if len(t) > 2]
return "_".join(tokens[:4]) if tokens else obj_lower.replace(" ", "_")
# ── Canonicalization ─────────────────────────────────────────────────
def canonicalize_text(action: str, obj: str, title: str = "") -> str:
"""Build a canonical English text for embedding.
Transforms German compliance text into normalized English tokens
for more stable embedding comparisons.
"""
norm_action = normalize_action(action)
norm_object = normalize_object(obj)
# Build canonical sentence
parts = [norm_action, norm_object]
if title:
# Add title keywords (stripped of common filler)
title_clean = re.sub(
r"\b(und|oder|für|von|zu|der|die|das|den|dem|des|ein|eine"
r"|bei|mit|nach|gemäß|gem\.|laut|entsprechend)\b",
"", title.lower()
)
title_tokens = [t for t in title_clean.split() if len(t) > 3][:5]
if title_tokens:
parts.append("for")
parts.extend(title_tokens)
return " ".join(parts)
# ── Embedding Helper ─────────────────────────────────────────────────
async def get_embedding(text: str) -> list[float]:
"""Get embedding vector for a single text via embedding service."""
try:
async with httpx.AsyncClient(timeout=10.0) as client:
resp = await client.post(
f"{EMBEDDING_URL}/embed",
json={"texts": [text]},
)
embeddings = resp.json().get("embeddings", [])
return embeddings[0] if embeddings else []
except Exception as e:
logger.warning("Embedding failed: %s", e)
return []
def cosine_similarity(a: list[float], b: list[float]) -> float:
"""Compute cosine similarity between two vectors."""
if not a or not b or len(a) != len(b):
return 0.0
dot = sum(x * y for x, y in zip(a, b))
norm_a = sum(x * x for x in a) ** 0.5
norm_b = sum(x * x for x in b) ** 0.5
if norm_a == 0 or norm_b == 0:
return 0.0
return dot / (norm_a * norm_b)
# ── Qdrant Helpers ───────────────────────────────────────────────────
async def qdrant_search(
embedding: list[float],
pattern_id: str,
top_k: int = 10,
collection: Optional[str] = None,
) -> list[dict]:
"""Search Qdrant for similar atomic controls, filtered by pattern_id."""
if not embedding:
return []
coll = collection or QDRANT_COLLECTION
body: dict = {
"vector": embedding,
"limit": top_k,
"with_payload": True,
"filter": {
"must": [
{"key": "pattern_id", "match": {"value": pattern_id}}
]
},
}
try:
async with httpx.AsyncClient(timeout=10.0) as client:
resp = await client.post(
f"{QDRANT_URL}/collections/{coll}/points/search",
json=body,
)
if resp.status_code != 200:
logger.warning("Qdrant search failed: %d", resp.status_code)
return []
return resp.json().get("result", [])
except Exception as e:
logger.warning("Qdrant search error: %s", e)
return []
async def qdrant_search_cross_regulation(
embedding: list[float],
top_k: int = 5,
collection: Optional[str] = None,
) -> list[dict]:
"""Search Qdrant for similar controls across ALL regulations (no pattern_id filter).
Used for cross-regulation linking (e.g. DSGVO Art. 25 ↔ NIS2 Art. 21).
"""
if not embedding:
return []
coll = collection or QDRANT_COLLECTION
body: dict = {
"vector": embedding,
"limit": top_k,
"with_payload": True,
}
try:
async with httpx.AsyncClient(timeout=10.0) as client:
resp = await client.post(
f"{QDRANT_URL}/collections/{coll}/points/search",
json=body,
)
if resp.status_code != 200:
logger.warning("Qdrant cross-reg search failed: %d", resp.status_code)
return []
return resp.json().get("result", [])
except Exception as e:
logger.warning("Qdrant cross-reg search error: %s", e)
return []
async def qdrant_upsert(
point_id: str,
embedding: list[float],
payload: dict,
collection: Optional[str] = None,
) -> bool:
"""Upsert a single point into a Qdrant collection."""
if not embedding:
return False
coll = collection or QDRANT_COLLECTION
body = {
"points": [{
"id": point_id,
"vector": embedding,
"payload": payload,
}]
}
try:
async with httpx.AsyncClient(timeout=10.0) as client:
resp = await client.put(
f"{QDRANT_URL}/collections/{coll}/points",
json=body,
)
return resp.status_code == 200
except Exception as e:
logger.warning("Qdrant upsert error: %s", e)
return False
async def ensure_qdrant_collection(
vector_size: int = 1024,
collection: Optional[str] = None,
) -> bool:
"""Create a Qdrant collection if it doesn't exist (idempotent)."""
coll = collection or QDRANT_COLLECTION
try:
async with httpx.AsyncClient(timeout=10.0) as client:
# Check if exists
resp = await client.get(f"{QDRANT_URL}/collections/{coll}")
if resp.status_code == 200:
return True
# Create
resp = await client.put(
f"{QDRANT_URL}/collections/{coll}",
json={
"vectors": {"size": vector_size, "distance": "Cosine"},
},
)
if resp.status_code == 200:
logger.info("Created Qdrant collection: %s", coll)
# Create payload indexes
for field_name in ["pattern_id", "action_normalized", "object_normalized", "control_id"]:
await client.put(
f"{QDRANT_URL}/collections/{coll}/index",
json={"field_name": field_name, "field_schema": "keyword"},
)
return True
logger.error("Failed to create Qdrant collection: %d", resp.status_code)
return False
except Exception as e:
logger.warning("Qdrant collection check error: %s", e)
return False
# ── Main Dedup Checker ───────────────────────────────────────────────
class ControlDedupChecker:
"""4-stage dedup checker for atomic controls.
Usage:
checker = ControlDedupChecker(db_session)
result = await checker.check_duplicate(candidate_action, candidate_object, candidate_title, pattern_id)
if result.verdict == "link":
checker.add_parent_link(result.matched_control_uuid, parent_uuid)
elif result.verdict == "review":
checker.write_review(candidate, result)
else:
# Insert new control
"""
def __init__(
self,
db,
embed_fn: Optional[Callable[[str], Awaitable[list[float]]]] = None,
search_fn: Optional[Callable] = None,
):
self.db = db
self._embed = embed_fn or get_embedding
self._search = search_fn or qdrant_search
self._cache: dict[str, list[dict]] = {} # pattern_id → existing controls
def _load_existing(self, pattern_id: str) -> list[dict]:
"""Load existing atomic controls with same pattern_id from DB."""
if pattern_id in self._cache:
return self._cache[pattern_id]
from sqlalchemy import text
rows = self.db.execute(text("""
SELECT id::text, control_id, title, objective,
pattern_id,
generation_metadata->>'obligation_type' as obligation_type
FROM canonical_controls
WHERE parent_control_uuid IS NOT NULL
AND release_state != 'deprecated'
AND pattern_id = :pid
"""), {"pid": pattern_id}).fetchall()
result = [
{
"uuid": r[0], "control_id": r[1], "title": r[2],
"objective": r[3], "pattern_id": r[4],
"obligation_type": r[5],
}
for r in rows
]
self._cache[pattern_id] = result
return result
async def check_duplicate(
self,
action: str,
obj: str,
title: str,
pattern_id: Optional[str],
) -> DedupResult:
"""Run the 4-stage dedup pipeline + cross-regulation linking.
Returns DedupResult with verdict: new/link/review.
"""
# No pattern_id → can't dedup meaningfully
if not pattern_id:
return DedupResult(verdict="new", stage="no_pattern")
# Stage 1: Pattern-Gate
existing = self._load_existing(pattern_id)
if not existing:
return DedupResult(
verdict="new", stage="pattern_gate",
details={"reason": "no existing controls with this pattern_id"},
)
# Stage 2: Action-Check
norm_action = normalize_action(action)
# We don't have action stored on existing controls from DB directly,
# so we use embedding for controls that passed pattern gate.
# But we CAN check via generation_metadata if available.
# Stage 3: Object-Normalization
norm_object = normalize_object(obj)
# Stage 4: Embedding Similarity
canonical = canonicalize_text(action, obj, title)
embedding = await self._embed(canonical)
if not embedding:
# Can't compute embedding → default to new
return DedupResult(
verdict="new", stage="embedding_unavailable",
details={"canonical_text": canonical},
)
# Search Qdrant
results = await self._search(embedding, pattern_id, top_k=5)
if not results:
# No intra-pattern matches → try cross-regulation
return await self._check_cross_regulation(embedding, DedupResult(
verdict="new", stage="no_qdrant_matches",
details={"canonical_text": canonical, "action": norm_action, "object": norm_object},
))
# Evaluate best match
best = results[0]
best_score = best.get("score", 0.0)
best_payload = best.get("payload", {})
best_action = best_payload.get("action_normalized", "")
best_object = best_payload.get("object_normalized", "")
# Action differs → NEW (even if embedding is high)
if best_action and norm_action and best_action != norm_action:
return await self._check_cross_regulation(embedding, DedupResult(
verdict="new", stage="action_mismatch",
similarity_score=best_score,
matched_control_id=best_payload.get("control_id"),
details={
"candidate_action": norm_action,
"existing_action": best_action,
"similarity": best_score,
},
))
# Object differs → use higher threshold
if best_object and norm_object and best_object != norm_object:
if best_score > LINK_THRESHOLD_DIFF_OBJECT:
return DedupResult(
verdict="link", stage="embedding_diff_object",
matched_control_uuid=best_payload.get("control_uuid"),
matched_control_id=best_payload.get("control_id"),
matched_title=best_payload.get("title"),
similarity_score=best_score,
details={"candidate_object": norm_object, "existing_object": best_object},
)
return await self._check_cross_regulation(embedding, DedupResult(
verdict="new", stage="object_mismatch_below_threshold",
similarity_score=best_score,
matched_control_id=best_payload.get("control_id"),
details={
"candidate_object": norm_object,
"existing_object": best_object,
"threshold": LINK_THRESHOLD_DIFF_OBJECT,
},
))
# Same action + same object → tiered thresholds
if best_score > LINK_THRESHOLD:
return DedupResult(
verdict="link", stage="embedding_match",
matched_control_uuid=best_payload.get("control_uuid"),
matched_control_id=best_payload.get("control_id"),
matched_title=best_payload.get("title"),
similarity_score=best_score,
)
if best_score > REVIEW_THRESHOLD:
return DedupResult(
verdict="review", stage="embedding_review",
matched_control_uuid=best_payload.get("control_uuid"),
matched_control_id=best_payload.get("control_id"),
matched_title=best_payload.get("title"),
similarity_score=best_score,
)
return await self._check_cross_regulation(embedding, DedupResult(
verdict="new", stage="embedding_below_threshold",
similarity_score=best_score,
details={"threshold": REVIEW_THRESHOLD},
))
async def _check_cross_regulation(
self,
embedding: list[float],
intra_result: DedupResult,
) -> DedupResult:
"""Second pass: cross-regulation linking for controls deemed 'new'.
Searches Qdrant WITHOUT pattern_id filter. Uses a higher threshold
(0.95) to avoid false positives across regulation boundaries.
"""
if intra_result.verdict != "new" or not embedding:
return intra_result
cross_results = await qdrant_search_cross_regulation(embedding, top_k=5)
if not cross_results:
return intra_result
best = cross_results[0]
best_score = best.get("score", 0.0)
if best_score > CROSS_REG_LINK_THRESHOLD:
best_payload = best.get("payload", {})
return DedupResult(
verdict="link",
stage="cross_regulation",
matched_control_uuid=best_payload.get("control_uuid"),
matched_control_id=best_payload.get("control_id"),
matched_title=best_payload.get("title"),
similarity_score=best_score,
link_type="cross_regulation",
details={
"cross_reg_score": best_score,
"cross_reg_threshold": CROSS_REG_LINK_THRESHOLD,
},
)
return intra_result
def add_parent_link(
self,
control_uuid: str,
parent_control_uuid: str,
link_type: str = "dedup_merge",
confidence: float = 0.0,
source_regulation: Optional[str] = None,
source_article: Optional[str] = None,
obligation_candidate_id: Optional[str] = None,
) -> None:
"""Add a parent link to an existing atomic control."""
from sqlalchemy import text
self.db.execute(text("""
INSERT INTO control_parent_links
(control_uuid, parent_control_uuid, link_type, confidence,
source_regulation, source_article, obligation_candidate_id)
VALUES (:cu, :pu, :lt, :conf, :sr, :sa, :oci::uuid)
ON CONFLICT (control_uuid, parent_control_uuid) DO NOTHING
"""), {
"cu": control_uuid,
"pu": parent_control_uuid,
"lt": link_type,
"conf": confidence,
"sr": source_regulation,
"sa": source_article,
"oci": obligation_candidate_id,
})
self.db.commit()
def write_review(
self,
candidate_control_id: str,
candidate_title: str,
candidate_objective: str,
result: DedupResult,
parent_control_uuid: Optional[str] = None,
obligation_candidate_id: Optional[str] = None,
) -> None:
"""Write a dedup review queue entry."""
from sqlalchemy import text
self.db.execute(text("""
INSERT INTO control_dedup_reviews
(candidate_control_id, candidate_title, candidate_objective,
matched_control_uuid, matched_control_id,
similarity_score, dedup_stage, dedup_details,
parent_control_uuid, obligation_candidate_id)
VALUES (:ccid, :ct, :co, :mcu::uuid, :mci, :ss, :ds,
:dd::jsonb, :pcu::uuid, :oci)
"""), {
"ccid": candidate_control_id,
"ct": candidate_title,
"co": candidate_objective,
"mcu": result.matched_control_uuid,
"mci": result.matched_control_id,
"ss": result.similarity_score,
"ds": result.stage,
"dd": __import__("json").dumps(result.details),
"pcu": parent_control_uuid,
"oci": obligation_candidate_id,
})
self.db.commit()
async def index_control(
self,
control_uuid: str,
control_id: str,
title: str,
action: str,
obj: str,
pattern_id: str,
collection: Optional[str] = None,
) -> bool:
"""Index a new atomic control in Qdrant for future dedup checks."""
norm_action = normalize_action(action)
norm_object = normalize_object(obj)
canonical = canonicalize_text(action, obj, title)
embedding = await self._embed(canonical)
if not embedding:
return False
return await qdrant_upsert(
point_id=control_uuid,
embedding=embedding,
payload={
"control_uuid": control_uuid,
"control_id": control_id,
"title": title,
"pattern_id": pattern_id,
"action_normalized": norm_action,
"object_normalized": norm_object,
"canonical_text": canonical,
},
collection=collection,
)

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,152 @@
"""
Control Status Transition State Machine.
Enforces that controls cannot be set to "pass" without sufficient evidence.
Prevents Compliance-Theater where controls claim compliance without real proof.
Transition rules:
planned → in_progress : always allowed
in_progress → pass : requires ≥1 evidence with confidence ≥ E2 and
truth_status in (uploaded, observed, validated_internal)
in_progress → partial : requires ≥1 evidence (any level)
pass → fail : always allowed (degradation)
any → n/a : requires status_justification
any → planned : always allowed (reset)
"""
from typing import List, Optional, Tuple
from ..db.models import EvidenceDB
# Confidence level ordering for comparisons
CONFIDENCE_ORDER = {"E0": 0, "E1": 1, "E2": 2, "E3": 3, "E4": 4}
# Truth statuses that qualify as "real" evidence for pass transitions
VALID_TRUTH_STATUSES = {"uploaded", "observed", "validated_internal", "accepted_by_auditor", "provided_to_auditor"}
def validate_transition(
current_status: str,
new_status: str,
evidence_list: Optional[List[EvidenceDB]] = None,
status_justification: Optional[str] = None,
bypass_for_auto_updater: bool = False,
) -> Tuple[bool, List[str]]:
"""
Validate whether a control status transition is allowed.
Args:
current_status: Current control status value (e.g. "planned", "pass")
new_status: Requested new status
evidence_list: List of EvidenceDB objects linked to this control
status_justification: Text justification (required for n/a transitions)
bypass_for_auto_updater: If True, skip evidence checks (used by CI/CD auto-updater
which creates evidence atomically with status change)
Returns:
Tuple of (allowed: bool, violations: list[str])
"""
violations: List[str] = []
evidence_list = evidence_list or []
# Same status → no-op, always allowed
if current_status == new_status:
return True, []
# Reset to planned is always allowed
if new_status == "planned":
return True, []
# n/a requires justification
if new_status == "n/a":
if not status_justification or not status_justification.strip():
violations.append("Transition to 'n/a' requires a status_justification explaining why this control is not applicable.")
return len(violations) == 0, violations
# Degradation: pass → fail is always allowed
if current_status == "pass" and new_status == "fail":
return True, []
# planned → in_progress: always allowed
if current_status == "planned" and new_status == "in_progress":
return True, []
# in_progress → partial: needs at least 1 evidence
if new_status == "partial":
if not bypass_for_auto_updater and len(evidence_list) == 0:
violations.append("Transition to 'partial' requires at least 1 evidence record.")
return len(violations) == 0, violations
# in_progress → pass: strict requirements
if new_status == "pass":
if bypass_for_auto_updater:
return True, []
if len(evidence_list) == 0:
violations.append("Transition to 'pass' requires at least 1 evidence record.")
return False, violations
# Check for at least one qualifying evidence
has_qualifying = False
for e in evidence_list:
conf = getattr(e, "confidence_level", None)
truth = getattr(e, "truth_status", None)
# Get string values from enum or string
conf_val = conf.value if hasattr(conf, "value") else str(conf) if conf else "E1"
truth_val = truth.value if hasattr(truth, "value") else str(truth) if truth else "uploaded"
if CONFIDENCE_ORDER.get(conf_val, 1) >= CONFIDENCE_ORDER["E2"] and truth_val in VALID_TRUTH_STATUSES:
has_qualifying = True
break
if not has_qualifying:
violations.append(
"Transition to 'pass' requires at least 1 evidence with confidence >= E2 "
"and truth_status in (uploaded, observed, validated_internal, accepted_by_auditor). "
"Current evidence does not meet this threshold."
)
return len(violations) == 0, violations
# in_progress → fail: always allowed
if new_status == "fail":
return True, []
# Any other transition from planned/fail to pass requires going through in_progress
if current_status in ("planned", "fail") and new_status == "pass":
if bypass_for_auto_updater:
return True, []
violations.append(
f"Direct transition from '{current_status}' to 'pass' is not allowed. "
f"Move to 'in_progress' first, then to 'pass' with qualifying evidence."
)
return False, violations
# Default: allow other transitions (e.g. fail → partial, partial → pass)
# For partial → pass, apply the same evidence checks
if current_status == "partial" and new_status == "pass":
if bypass_for_auto_updater:
return True, []
has_qualifying = False
for e in evidence_list:
conf = getattr(e, "confidence_level", None)
truth = getattr(e, "truth_status", None)
conf_val = conf.value if hasattr(conf, "value") else str(conf) if conf else "E1"
truth_val = truth.value if hasattr(truth, "value") else str(truth) if truth else "uploaded"
if CONFIDENCE_ORDER.get(conf_val, 1) >= CONFIDENCE_ORDER["E2"] and truth_val in VALID_TRUTH_STATUSES:
has_qualifying = True
break
if not has_qualifying:
violations.append(
"Transition from 'partial' to 'pass' requires at least 1 evidence with confidence >= E2 "
"and truth_status in (uploaded, observed, validated_internal, accepted_by_auditor)."
)
return len(violations) == 0, violations
# All other transitions allowed
return True, []

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,714 @@
"""Framework Decomposition Engine — decomposes framework-container obligations.
Sits between Pass 0a (obligation extraction) and Pass 0b (atomic control
composition). Detects obligations that reference a framework domain (e.g.
"CCM-Praktiken fuer AIS") and decomposes them into concrete sub-obligations
using an internal framework registry.
Three routing types:
atomic → pass through to Pass 0b unchanged
compound → split compound verbs, then Pass 0b
framework_container → decompose via registry, then Pass 0b
The registry is a set of JSON files under compliance/data/frameworks/.
"""
import json
import logging
import os
import re
import uuid
from dataclasses import dataclass, field
from pathlib import Path
from typing import Optional
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Registry loading
# ---------------------------------------------------------------------------
_REGISTRY_DIR = Path(__file__).resolve().parent.parent / "data" / "frameworks"
_REGISTRY: dict[str, dict] = {} # framework_id → framework dict
def _load_registry() -> dict[str, dict]:
"""Load all framework JSON files from the registry directory."""
registry: dict[str, dict] = {}
if not _REGISTRY_DIR.is_dir():
logger.warning("Framework registry dir not found: %s", _REGISTRY_DIR)
return registry
for fpath in sorted(_REGISTRY_DIR.glob("*.json")):
try:
with open(fpath, encoding="utf-8") as f:
fw = json.load(f)
fw_id = fw.get("framework_id", fpath.stem)
registry[fw_id] = fw
logger.info(
"Loaded framework: %s (%d domains)",
fw_id,
len(fw.get("domains", [])),
)
except Exception:
logger.exception("Failed to load framework file: %s", fpath)
return registry
def get_registry() -> dict[str, dict]:
"""Return the global framework registry (lazy-loaded)."""
global _REGISTRY
if not _REGISTRY:
_REGISTRY = _load_registry()
return _REGISTRY
def reload_registry() -> dict[str, dict]:
"""Force-reload the framework registry from disk."""
global _REGISTRY
_REGISTRY = _load_registry()
return _REGISTRY
# ---------------------------------------------------------------------------
# Framework alias index (built from registry)
# ---------------------------------------------------------------------------
def _build_alias_index(registry: dict[str, dict]) -> dict[str, str]:
"""Build a lowercase alias → framework_id lookup."""
idx: dict[str, str] = {}
for fw_id, fw in registry.items():
# Framework-level aliases
idx[fw_id.lower()] = fw_id
name = fw.get("display_name", "")
if name:
idx[name.lower()] = fw_id
# Common short forms
for part in fw_id.lower().replace("_", " ").split():
if len(part) >= 3:
idx[part] = fw_id
return idx
# ---------------------------------------------------------------------------
# Routing — classify obligation type
# ---------------------------------------------------------------------------
# Extended patterns for framework detection (beyond the simple _COMPOSITE_RE
# in decomposition_pass.py — here we also capture the framework name)
_FRAMEWORK_PATTERN = re.compile(
r"(?:praktiken|kontrollen|ma(?:ss|ß)nahmen|anforderungen|vorgaben|controls|practices|measures|requirements)"
r"\s+(?:f(?:ue|ü)r|aus|gem(?:ae|ä)(?:ss|ß)|nach|from|of|for|per)\s+"
r"(.+?)(?:\s+(?:m(?:ue|ü)ssen|sollen|sind|werden|implementieren|umsetzen|einf(?:ue|ü)hren)|\.|,|$)",
re.IGNORECASE,
)
# Direct framework name references
_DIRECT_FRAMEWORK_RE = re.compile(
r"\b(?:CSA\s*CCM|NIST\s*(?:SP\s*)?800-53|OWASP\s*(?:ASVS|SAMM|Top\s*10)"
r"|CIS\s*Controls|BSI\s*(?:IT-)?Grundschutz|ENISA|ISO\s*2700[12]"
r"|COBIT|SOX|PCI\s*DSS|HITRUST|SOC\s*2|KRITIS)\b",
re.IGNORECASE,
)
# Compound verb patterns (multiple main verbs)
_COMPOUND_VERB_RE = re.compile(
r"\b(?:und|sowie|als\s+auch|or|and)\b",
re.IGNORECASE,
)
# No-split phrases that look compound but aren't
_NO_SPLIT_PHRASES = [
"pflegen und aufrechterhalten",
"dokumentieren und pflegen",
"definieren und dokumentieren",
"erstellen und freigeben",
"pruefen und genehmigen",
"identifizieren und bewerten",
"erkennen und melden",
"define and maintain",
"create and maintain",
"establish and maintain",
"monitor and review",
"detect and respond",
]
@dataclass
class RoutingResult:
"""Result of obligation routing classification."""
routing_type: str # atomic | compound | framework_container | unknown_review
framework_ref: Optional[str] = None
framework_domain: Optional[str] = None
domain_title: Optional[str] = None
confidence: float = 0.0
reason: str = ""
def classify_routing(
obligation_text: str,
action_raw: str,
object_raw: str,
condition_raw: Optional[str] = None,
) -> RoutingResult:
"""Classify an obligation into atomic / compound / framework_container."""
combined = f"{obligation_text} {object_raw}".lower()
# --- Step 1: Framework container detection ---
fw_result = _detect_framework(obligation_text, object_raw)
if fw_result.routing_type == "framework_container":
return fw_result
# --- Step 2: Compound verb detection ---
if _is_compound_obligation(action_raw, obligation_text):
return RoutingResult(
routing_type="compound",
confidence=0.7,
reason="multiple_main_verbs",
)
# --- Step 3: Default = atomic ---
return RoutingResult(
routing_type="atomic",
confidence=0.9,
reason="single_action_single_object",
)
def _detect_framework(
obligation_text: str, object_raw: str,
) -> RoutingResult:
"""Detect if obligation references a framework domain."""
combined = f"{obligation_text} {object_raw}"
registry = get_registry()
alias_idx = _build_alias_index(registry)
# Strategy 1: direct framework name match
m = _DIRECT_FRAMEWORK_RE.search(combined)
if m:
fw_name = m.group(0).strip()
fw_id = _resolve_framework_id(fw_name, alias_idx, registry)
if fw_id:
domain_id, domain_title = _match_domain(
combined, registry[fw_id],
)
return RoutingResult(
routing_type="framework_container",
framework_ref=fw_id,
framework_domain=domain_id,
domain_title=domain_title,
confidence=0.95 if domain_id else 0.75,
reason=f"direct_framework_match:{fw_name}",
)
else:
# Framework name recognized but not in registry
return RoutingResult(
routing_type="framework_container",
framework_ref=None,
framework_domain=None,
confidence=0.6,
reason=f"direct_framework_match_no_registry:{fw_name}",
)
# Strategy 2: pattern match ("Praktiken fuer X")
m2 = _FRAMEWORK_PATTERN.search(combined)
if m2:
ref_text = m2.group(1).strip()
fw_id, domain_id, domain_title = _resolve_from_ref_text(
ref_text, registry, alias_idx,
)
if fw_id:
return RoutingResult(
routing_type="framework_container",
framework_ref=fw_id,
framework_domain=domain_id,
domain_title=domain_title,
confidence=0.85 if domain_id else 0.65,
reason=f"pattern_match:{ref_text}",
)
# Strategy 3: keyword-heavy object
if _has_framework_keywords(object_raw):
return RoutingResult(
routing_type="framework_container",
framework_ref=None,
framework_domain=None,
confidence=0.5,
reason="framework_keywords_in_object",
)
return RoutingResult(routing_type="atomic", confidence=0.0)
def _resolve_framework_id(
name: str,
alias_idx: dict[str, str],
registry: dict[str, dict],
) -> Optional[str]:
"""Resolve a framework name to its registry ID."""
normalized = re.sub(r"\s+", " ", name.strip().lower())
# Direct alias match
if normalized in alias_idx:
return alias_idx[normalized]
# Try compact form (strip spaces, hyphens, underscores)
compact = re.sub(r"[\s_\-]+", "", normalized)
for alias, fw_id in alias_idx.items():
if re.sub(r"[\s_\-]+", "", alias) == compact:
return fw_id
# Substring match in display names
for fw_id, fw in registry.items():
display = fw.get("display_name", "").lower()
if normalized in display or display in normalized:
return fw_id
# Partial match: check if normalized contains any alias (for multi-word refs)
for alias, fw_id in alias_idx.items():
if len(alias) >= 4 and alias in normalized:
return fw_id
return None
def _match_domain(
text: str, framework: dict,
) -> tuple[Optional[str], Optional[str]]:
"""Match a domain within a framework from text references."""
text_lower = text.lower()
best_id: Optional[str] = None
best_title: Optional[str] = None
best_score = 0
for domain in framework.get("domains", []):
score = 0
domain_id = domain["domain_id"]
title = domain.get("title", "")
# Exact domain ID match (e.g. "AIS")
if re.search(rf"\b{re.escape(domain_id)}\b", text, re.IGNORECASE):
score += 10
# Full title match
if title.lower() in text_lower:
score += 8
# Alias match
for alias in domain.get("aliases", []):
if alias.lower() in text_lower:
score += 6
break
# Keyword overlap
kw_hits = sum(
1 for kw in domain.get("keywords", [])
if kw.lower() in text_lower
)
score += kw_hits
if score > best_score:
best_score = score
best_id = domain_id
best_title = title
if best_score >= 3:
return best_id, best_title
return None, None
def _resolve_from_ref_text(
ref_text: str,
registry: dict[str, dict],
alias_idx: dict[str, str],
) -> tuple[Optional[str], Optional[str], Optional[str]]:
"""Resolve framework + domain from a reference text like 'AIS' or 'Application Security'."""
ref_lower = ref_text.lower()
for fw_id, fw in registry.items():
for domain in fw.get("domains", []):
# Check domain ID
if domain["domain_id"].lower() in ref_lower:
return fw_id, domain["domain_id"], domain.get("title")
# Check title
if domain.get("title", "").lower() in ref_lower:
return fw_id, domain["domain_id"], domain.get("title")
# Check aliases
for alias in domain.get("aliases", []):
if alias.lower() in ref_lower or ref_lower in alias.lower():
return fw_id, domain["domain_id"], domain.get("title")
return None, None, None
_FRAMEWORK_KW_SET = {
"praktiken", "kontrollen", "massnahmen", "maßnahmen",
"anforderungen", "vorgaben", "framework", "standard",
"baseline", "katalog", "domain", "family", "category",
"practices", "controls", "measures", "requirements",
}
def _has_framework_keywords(text: str) -> bool:
"""Check if text contains framework-indicator keywords."""
words = set(re.findall(r"[a-zäöüß]+", text.lower()))
return len(words & _FRAMEWORK_KW_SET) >= 2
def _is_compound_obligation(action_raw: str, obligation_text: str) -> bool:
"""Detect if the obligation has multiple competing main verbs."""
if not action_raw:
return False
action_lower = action_raw.lower().strip()
# Check no-split phrases first
for phrase in _NO_SPLIT_PHRASES:
if phrase in action_lower:
return False
# Must have a conjunction
if not _COMPOUND_VERB_RE.search(action_lower):
return False
# Split by conjunctions and check if we get 2+ meaningful verbs
parts = re.split(r"\b(?:und|sowie|als\s+auch|or|and)\b", action_lower)
meaningful = [p.strip() for p in parts if len(p.strip()) >= 3]
return len(meaningful) >= 2
# ---------------------------------------------------------------------------
# Framework Decomposition
# ---------------------------------------------------------------------------
@dataclass
class DecomposedObligation:
"""A concrete obligation derived from a framework container."""
obligation_candidate_id: str
parent_control_id: str
parent_framework_container_id: str
source_ref_law: str
source_ref_article: str
obligation_text: str
actor: str
action_raw: str
object_raw: str
condition_raw: Optional[str] = None
trigger_raw: Optional[str] = None
routing_type: str = "atomic"
release_state: str = "decomposed"
subcontrol_id: str = ""
# Metadata
action_hint: str = ""
object_hint: str = ""
object_class: str = ""
keywords: list[str] = field(default_factory=list)
@dataclass
class FrameworkDecompositionResult:
"""Result of framework decomposition."""
framework_container_id: str
source_obligation_candidate_id: str
framework_ref: Optional[str]
framework_domain: Optional[str]
domain_title: Optional[str]
matched_subcontrols: list[str]
decomposition_confidence: float
release_state: str # decomposed | unmatched | error
decomposed_obligations: list[DecomposedObligation]
issues: list[str]
def decompose_framework_container(
obligation_candidate_id: str,
parent_control_id: str,
obligation_text: str,
framework_ref: Optional[str],
framework_domain: Optional[str],
actor: str = "organization",
) -> FrameworkDecompositionResult:
"""Decompose a framework-container obligation into concrete sub-obligations.
Steps:
1. Resolve framework from registry
2. Resolve domain within framework
3. Select relevant subcontrols (keyword filter or full domain)
4. Generate decomposed obligations
"""
container_id = f"FWC-{uuid.uuid4().hex[:8]}"
registry = get_registry()
issues: list[str] = []
# Step 1: Resolve framework
fw = None
if framework_ref and framework_ref in registry:
fw = registry[framework_ref]
else:
# Try to find by name in text
fw, framework_ref = _find_framework_in_text(obligation_text, registry)
if not fw:
issues.append("ERROR: framework_not_matched")
return FrameworkDecompositionResult(
framework_container_id=container_id,
source_obligation_candidate_id=obligation_candidate_id,
framework_ref=framework_ref,
framework_domain=framework_domain,
domain_title=None,
matched_subcontrols=[],
decomposition_confidence=0.0,
release_state="unmatched",
decomposed_obligations=[],
issues=issues,
)
# Step 2: Resolve domain
domain_data = None
domain_title = None
if framework_domain:
for d in fw.get("domains", []):
if d["domain_id"].lower() == framework_domain.lower():
domain_data = d
domain_title = d.get("title")
break
if not domain_data:
# Try matching from text
domain_id, domain_title = _match_domain(obligation_text, fw)
if domain_id:
for d in fw.get("domains", []):
if d["domain_id"] == domain_id:
domain_data = d
framework_domain = domain_id
break
if not domain_data:
issues.append("WARN: domain_not_matched — using all domains")
# Fall back to all subcontrols across all domains
all_subcontrols = []
for d in fw.get("domains", []):
for sc in d.get("subcontrols", []):
sc["_domain_id"] = d["domain_id"]
all_subcontrols.append(sc)
subcontrols = _select_subcontrols(obligation_text, all_subcontrols)
if not subcontrols:
issues.append("ERROR: no_subcontrols_matched")
return FrameworkDecompositionResult(
framework_container_id=container_id,
source_obligation_candidate_id=obligation_candidate_id,
framework_ref=framework_ref,
framework_domain=framework_domain,
domain_title=None,
matched_subcontrols=[],
decomposition_confidence=0.0,
release_state="unmatched",
decomposed_obligations=[],
issues=issues,
)
else:
# Step 3: Select subcontrols from domain
raw_subcontrols = domain_data.get("subcontrols", [])
subcontrols = _select_subcontrols(obligation_text, raw_subcontrols)
if not subcontrols:
# Full domain decomposition
subcontrols = raw_subcontrols
# Quality check: too many subcontrols
if len(subcontrols) > 25:
issues.append(f"WARN: {len(subcontrols)} subcontrols — may be too broad")
# Step 4: Generate decomposed obligations
display_name = fw.get("display_name", framework_ref or "Unknown")
decomposed: list[DecomposedObligation] = []
matched_ids: list[str] = []
for sc in subcontrols:
sc_id = sc.get("subcontrol_id", "")
matched_ids.append(sc_id)
action_hint = sc.get("action_hint", "")
object_hint = sc.get("object_hint", "")
# Quality warnings
if not action_hint:
issues.append(f"WARN: {sc_id} missing action_hint")
if not object_hint:
issues.append(f"WARN: {sc_id} missing object_hint")
obl_id = f"{obligation_candidate_id}-{sc_id}"
decomposed.append(DecomposedObligation(
obligation_candidate_id=obl_id,
parent_control_id=parent_control_id,
parent_framework_container_id=container_id,
source_ref_law=display_name,
source_ref_article=sc_id,
obligation_text=sc.get("statement", ""),
actor=actor,
action_raw=action_hint or _infer_action(sc.get("statement", "")),
object_raw=object_hint or _infer_object(sc.get("statement", "")),
routing_type="atomic",
release_state="decomposed",
subcontrol_id=sc_id,
action_hint=action_hint,
object_hint=object_hint,
object_class=sc.get("object_class", ""),
keywords=sc.get("keywords", []),
))
# Check if decomposed are identical to container
for d in decomposed:
if d.obligation_text.strip() == obligation_text.strip():
issues.append(f"WARN: {d.subcontrol_id} identical to container text")
confidence = _compute_decomposition_confidence(
framework_ref, framework_domain, domain_data, len(subcontrols), issues,
)
return FrameworkDecompositionResult(
framework_container_id=container_id,
source_obligation_candidate_id=obligation_candidate_id,
framework_ref=framework_ref,
framework_domain=framework_domain,
domain_title=domain_title,
matched_subcontrols=matched_ids,
decomposition_confidence=confidence,
release_state="decomposed",
decomposed_obligations=decomposed,
issues=issues,
)
def _find_framework_in_text(
text: str, registry: dict[str, dict],
) -> tuple[Optional[dict], Optional[str]]:
"""Try to find a framework by searching text for known names."""
alias_idx = _build_alias_index(registry)
m = _DIRECT_FRAMEWORK_RE.search(text)
if m:
fw_id = _resolve_framework_id(m.group(0), alias_idx, registry)
if fw_id and fw_id in registry:
return registry[fw_id], fw_id
return None, None
def _select_subcontrols(
obligation_text: str, subcontrols: list[dict],
) -> list[dict]:
"""Select relevant subcontrols based on keyword matching.
Returns empty list if no targeted match found (caller falls back to
full domain).
"""
text_lower = obligation_text.lower()
scored: list[tuple[int, dict]] = []
for sc in subcontrols:
score = 0
for kw in sc.get("keywords", []):
if kw.lower() in text_lower:
score += 1
# Title match
title = sc.get("title", "").lower()
if title and title in text_lower:
score += 3
# Object hint in text
obj = sc.get("object_hint", "").lower()
if obj and obj in text_lower:
score += 2
if score > 0:
scored.append((score, sc))
if not scored:
return []
# Only return those with meaningful overlap (score >= 2)
scored.sort(key=lambda x: x[0], reverse=True)
return [sc for score, sc in scored if score >= 2]
def _infer_action(statement: str) -> str:
"""Infer a basic action verb from a statement."""
s = statement.lower()
if any(w in s for w in ["definiert", "definieren", "define"]):
return "definieren"
if any(w in s for w in ["implementiert", "implementieren", "implement"]):
return "implementieren"
if any(w in s for w in ["dokumentiert", "dokumentieren", "document"]):
return "dokumentieren"
if any(w in s for w in ["ueberwacht", "ueberwachen", "monitor"]):
return "ueberwachen"
if any(w in s for w in ["getestet", "testen", "test"]):
return "testen"
if any(w in s for w in ["geschuetzt", "schuetzen", "protect"]):
return "implementieren"
if any(w in s for w in ["verwaltet", "verwalten", "manage"]):
return "pflegen"
if any(w in s for w in ["gemeldet", "melden", "report"]):
return "melden"
return "implementieren"
def _infer_object(statement: str) -> str:
"""Infer the primary object from a statement (first noun phrase)."""
# Simple heuristic: take the text after "muessen"/"muss" up to the verb
m = re.search(
r"(?:muessen|muss|m(?:ü|ue)ssen)\s+(.+?)(?:\s+werden|\s+sein|\.|,|$)",
statement,
re.IGNORECASE,
)
if m:
return m.group(1).strip()[:80]
# Fallback: first 80 chars
return statement[:80] if statement else ""
def _compute_decomposition_confidence(
framework_ref: Optional[str],
domain: Optional[str],
domain_data: Optional[dict],
num_subcontrols: int,
issues: list[str],
) -> float:
"""Compute confidence score for the decomposition."""
score = 0.3
if framework_ref:
score += 0.25
if domain:
score += 0.20
if domain_data:
score += 0.10
if 1 <= num_subcontrols <= 15:
score += 0.10
elif num_subcontrols > 15:
score += 0.05 # less confident with too many
# Penalize errors
errors = sum(1 for i in issues if i.startswith("ERROR:"))
score -= errors * 0.15
return round(max(min(score, 1.0), 0.0), 2)
# ---------------------------------------------------------------------------
# Registry statistics (for admin/debugging)
# ---------------------------------------------------------------------------
def registry_stats() -> dict:
"""Return summary statistics about the loaded registry."""
reg = get_registry()
stats = {
"frameworks": len(reg),
"details": [],
}
total_domains = 0
total_subcontrols = 0
for fw_id, fw in reg.items():
domains = fw.get("domains", [])
n_sc = sum(len(d.get("subcontrols", [])) for d in domains)
total_domains += len(domains)
total_subcontrols += n_sc
stats["details"].append({
"framework_id": fw_id,
"display_name": fw.get("display_name", ""),
"domains": len(domains),
"subcontrols": n_sc,
})
stats["total_domains"] = total_domains
stats["total_subcontrols"] = total_subcontrols
return stats

View File

@@ -173,6 +173,7 @@ class LLMProviderType(str, Enum):
"""Supported LLM provider types."""
ANTHROPIC = "anthropic"
SELF_HOSTED = "self_hosted"
OLLAMA = "ollama" # Alias for self_hosted (Ollama-specific)
MOCK = "mock" # For testing
@@ -392,6 +393,7 @@ class SelfHostedProvider(LLMProvider):
"model": self.model,
"prompt": full_prompt,
"stream": False,
"think": False, # Disable thinking mode (qwen3.5 etc.)
"options": {}
}
@@ -549,7 +551,7 @@ def get_llm_config() -> LLMConfig:
vault_path="breakpilot/api_keys/anthropic",
env_var="ANTHROPIC_API_KEY"
)
elif provider_type == LLMProviderType.SELF_HOSTED:
elif provider_type in (LLMProviderType.SELF_HOSTED, LLMProviderType.OLLAMA):
api_key = get_secret_from_vault_or_env(
vault_path="breakpilot/api_keys/self_hosted_llm",
env_var="SELF_HOSTED_LLM_KEY"
@@ -558,7 +560,7 @@ def get_llm_config() -> LLMConfig:
# Select model based on provider type
if provider_type == LLMProviderType.ANTHROPIC:
model = os.getenv("ANTHROPIC_MODEL", "claude-sonnet-4-20250514")
elif provider_type == LLMProviderType.SELF_HOSTED:
elif provider_type in (LLMProviderType.SELF_HOSTED, LLMProviderType.OLLAMA):
model = os.getenv("SELF_HOSTED_LLM_MODEL", "qwen2.5:14b")
else:
model = "mock-model"
@@ -591,7 +593,7 @@ def get_llm_provider(config: Optional[LLMConfig] = None) -> LLMProvider:
return MockProvider(config)
return AnthropicProvider(config)
elif config.provider_type == LLMProviderType.SELF_HOSTED:
elif config.provider_type in (LLMProviderType.SELF_HOSTED, LLMProviderType.OLLAMA):
if not config.base_url:
logger.warning("No self-hosted LLM URL found, using mock provider")
return MockProvider(config)

View File

@@ -0,0 +1,59 @@
"""Shared normative language patterns for assertion classification.
Extracted from decomposition_pass.py for reuse in the assertion engine.
"""
import re
_PFLICHT_SIGNALS = [
r"\bmüssen\b", r"\bmuss\b", r"\bhat\s+sicherzustellen\b",
r"\bhaben\s+sicherzustellen\b", r"\bsind\s+verpflichtet\b",
r"\bist\s+verpflichtet\b",
r"\bist\s+zu\s+\w+en\b", r"\bsind\s+zu\s+\w+en\b",
r"\bhat\s+zu\s+\w+en\b", r"\bhaben\s+zu\s+\w+en\b",
r"\bist\s+\w+zu\w+en\b", r"\bsind\s+\w+zu\w+en\b",
r"\bist\s+\w+\s+zu\s+\w+en\b", r"\bsind\s+\w+\s+zu\s+\w+en\b",
r"\bhat\s+\w+\s+zu\s+\w+en\b", r"\bhaben\s+\w+\s+zu\s+\w+en\b",
r"\bshall\b", r"\bmust\b", r"\brequired\b",
r"\b\w+zuteilen\b", r"\b\w+zuwenden\b", r"\b\w+zustellen\b", r"\b\w+zulegen\b",
r"\b\w+zunehmen\b", r"\b\w+zuführen\b", r"\b\w+zuhalten\b", r"\b\w+zusetzen\b",
r"\b\w+zuweisen\b", r"\b\w+zuordnen\b", r"\b\w+zufügen\b", r"\b\w+zugeben\b",
r"\bist\b.{1,80}\bzu\s+\w+en\b", r"\bsind\b.{1,80}\bzu\s+\w+en\b",
]
PFLICHT_RE = re.compile("|".join(_PFLICHT_SIGNALS), re.IGNORECASE)
_EMPFEHLUNG_SIGNALS = [
r"\bsoll\b", r"\bsollen\b", r"\bsollte\b", r"\bsollten\b",
r"\bgewährleisten\b", r"\bsicherstellen\b",
r"\bshould\b", r"\bensure\b", r"\brecommend\w*\b",
r"\bnachweisen\b", r"\beinhalten\b", r"\bunterlassen\b", r"\bwahren\b",
r"\bdokumentieren\b", r"\bimplementieren\b", r"\büberprüfen\b", r"\büberwachen\b",
r"\bprüfen,\s+ob\b", r"\bkontrollieren,\s+ob\b",
]
EMPFEHLUNG_RE = re.compile("|".join(_EMPFEHLUNG_SIGNALS), re.IGNORECASE)
_KANN_SIGNALS = [
r"\bkann\b", r"\bkönnen\b", r"\bdarf\b", r"\bdürfen\b",
r"\bmay\b", r"\boptional\b",
]
KANN_RE = re.compile("|".join(_KANN_SIGNALS), re.IGNORECASE)
NORMATIVE_RE = re.compile(
"|".join(_PFLICHT_SIGNALS + _EMPFEHLUNG_SIGNALS + _KANN_SIGNALS),
re.IGNORECASE,
)
_RATIONALE_SIGNALS = [
r"\bda\s+", r"\bweil\b", r"\bgrund\b", r"\berwägung",
r"\bbecause\b", r"\breason\b", r"\brationale\b",
r"\bkönnen\s+.*\s+verursachen\b", r"\bführt\s+zu\b",
]
RATIONALE_RE = re.compile("|".join(_RATIONALE_SIGNALS), re.IGNORECASE)
# Evidence-related keywords (for fact detection)
_EVIDENCE_KEYWORDS = [
r"\bnachweis\b", r"\bzertifikat\b", r"\baudit.report\b",
r"\bprotokoll\b", r"\bdokumentation\b", r"\bbericht\b",
r"\bcertificate\b", r"\bevidence\b", r"\bproof\b",
]
EVIDENCE_RE = re.compile("|".join(_EVIDENCE_KEYWORDS), re.IGNORECASE)

View File

@@ -0,0 +1,563 @@
"""Obligation Extractor — 3-Tier Chunk-to-Obligation Linking.
Maps RAG chunks to obligations from the v2 obligation framework using
three tiers (fastest first):
Tier 1: EXACT MATCH — regulation_code + article → obligation_id (~40%)
Tier 2: EMBEDDING — chunk text vs. obligation descriptions (~30%)
Tier 3: LLM EXTRACT — local Ollama extracts obligation text (~25%)
Part of the Multi-Layer Control Architecture (Phase 4 of 8).
"""
import json
import logging
import os
import re
from dataclasses import dataclass, field
from pathlib import Path
from typing import Optional
import httpx
logger = logging.getLogger(__name__)
EMBEDDING_URL = os.getenv("EMBEDDING_URL", "http://embedding-service:8087")
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
OLLAMA_MODEL = os.getenv("CONTROL_GEN_OLLAMA_MODEL", "qwen3.5:35b-a3b")
LLM_TIMEOUT = float(os.getenv("CONTROL_GEN_LLM_TIMEOUT", "180"))
# Embedding similarity thresholds for Tier 2
EMBEDDING_MATCH_THRESHOLD = 0.80
EMBEDDING_CANDIDATE_THRESHOLD = 0.60
# ---------------------------------------------------------------------------
# Regulation code mapping: RAG chunk codes → obligation file regulation IDs
# ---------------------------------------------------------------------------
_REGULATION_CODE_TO_ID = {
# DSGVO
"eu_2016_679": "dsgvo",
"dsgvo": "dsgvo",
"gdpr": "dsgvo",
# AI Act
"eu_2024_1689": "ai_act",
"ai_act": "ai_act",
"aiact": "ai_act",
# NIS2
"eu_2022_2555": "nis2",
"nis2": "nis2",
"bsig": "nis2",
# BDSG
"bdsg": "bdsg",
# TTDSG
"ttdsg": "ttdsg",
# DSA
"eu_2022_2065": "dsa",
"dsa": "dsa",
# Data Act
"eu_2023_2854": "data_act",
"data_act": "data_act",
# EU Machinery
"eu_2023_1230": "eu_machinery",
"eu_machinery": "eu_machinery",
# DORA
"eu_2022_2554": "dora",
"dora": "dora",
}
@dataclass
class ObligationMatch:
"""Result of obligation extraction."""
obligation_id: Optional[str] = None
obligation_title: Optional[str] = None
obligation_text: Optional[str] = None
method: str = "none" # exact_match | embedding_match | llm_extracted | inferred
confidence: float = 0.0
regulation_id: Optional[str] = None # e.g. "dsgvo"
def to_dict(self) -> dict:
return {
"obligation_id": self.obligation_id,
"obligation_title": self.obligation_title,
"obligation_text": self.obligation_text,
"method": self.method,
"confidence": self.confidence,
"regulation_id": self.regulation_id,
}
@dataclass
class _ObligationEntry:
"""Internal representation of a loaded obligation."""
id: str
title: str
description: str
regulation_id: str
articles: list[str] = field(default_factory=list) # normalized: ["art. 30", "§ 38"]
embedding: list[float] = field(default_factory=list)
class ObligationExtractor:
"""3-Tier obligation extraction from RAG chunks.
Usage::
extractor = ObligationExtractor()
await extractor.initialize() # loads obligations + embeddings
match = await extractor.extract(
chunk_text="...",
regulation_code="eu_2016_679",
article="Art. 30",
paragraph="Abs. 1",
)
"""
def __init__(self):
self._article_lookup: dict[str, list[str]] = {} # "dsgvo/art. 30" → ["DSGVO-OBL-001"]
self._obligations: dict[str, _ObligationEntry] = {} # id → entry
self._obligation_embeddings: list[list[float]] = []
self._obligation_ids: list[str] = []
self._initialized = False
async def initialize(self) -> None:
"""Load all obligations from v2 JSON files and compute embeddings."""
if self._initialized:
return
self._load_obligations()
await self._compute_embeddings()
self._initialized = True
logger.info(
"ObligationExtractor initialized: %d obligations, %d article lookups, %d embeddings",
len(self._obligations),
len(self._article_lookup),
sum(1 for e in self._obligation_embeddings if e),
)
async def extract(
self,
chunk_text: str,
regulation_code: str,
article: Optional[str] = None,
paragraph: Optional[str] = None,
) -> ObligationMatch:
"""Extract obligation from a chunk using 3-tier strategy."""
if not self._initialized:
await self.initialize()
reg_id = _normalize_regulation(regulation_code)
# Tier 1: Exact match via article lookup
if article:
match = self._tier1_exact(reg_id, article)
if match:
return match
# Tier 2: Embedding similarity
match = await self._tier2_embedding(chunk_text, reg_id)
if match:
return match
# Tier 3: LLM extraction
match = await self._tier3_llm(chunk_text, regulation_code, article)
return match
# -----------------------------------------------------------------------
# Tier 1: Exact Match
# -----------------------------------------------------------------------
def _tier1_exact(self, reg_id: Optional[str], article: str) -> Optional[ObligationMatch]:
"""Look up obligation by regulation + article."""
if not reg_id:
return None
norm_article = _normalize_article(article)
key = f"{reg_id}/{norm_article}"
obl_ids = self._article_lookup.get(key)
if not obl_ids:
return None
# Take the first match (highest priority)
obl_id = obl_ids[0]
entry = self._obligations.get(obl_id)
if not entry:
return None
return ObligationMatch(
obligation_id=entry.id,
obligation_title=entry.title,
obligation_text=entry.description,
method="exact_match",
confidence=1.0,
regulation_id=reg_id,
)
# -----------------------------------------------------------------------
# Tier 2: Embedding Match
# -----------------------------------------------------------------------
async def _tier2_embedding(
self, chunk_text: str, reg_id: Optional[str]
) -> Optional[ObligationMatch]:
"""Find nearest obligation by embedding similarity."""
if not self._obligation_embeddings:
return None
chunk_embedding = await _get_embedding(chunk_text[:2000])
if not chunk_embedding:
return None
best_idx = -1
best_score = 0.0
for i, obl_emb in enumerate(self._obligation_embeddings):
if not obl_emb:
continue
# Prefer same-regulation matches
obl_id = self._obligation_ids[i]
entry = self._obligations.get(obl_id)
score = _cosine_sim(chunk_embedding, obl_emb)
# Domain bonus: +0.05 if same regulation
if entry and reg_id and entry.regulation_id == reg_id:
score += 0.05
if score > best_score:
best_score = score
best_idx = i
if best_idx < 0:
return None
# Remove domain bonus for threshold comparison
raw_score = best_score
obl_id = self._obligation_ids[best_idx]
entry = self._obligations.get(obl_id)
if entry and reg_id and entry.regulation_id == reg_id:
raw_score -= 0.05
if raw_score >= EMBEDDING_MATCH_THRESHOLD:
return ObligationMatch(
obligation_id=entry.id if entry else obl_id,
obligation_title=entry.title if entry else None,
obligation_text=entry.description if entry else None,
method="embedding_match",
confidence=round(min(raw_score, 1.0), 3),
regulation_id=entry.regulation_id if entry else reg_id,
)
return None
# -----------------------------------------------------------------------
# Tier 3: LLM Extraction
# -----------------------------------------------------------------------
async def _tier3_llm(
self, chunk_text: str, regulation_code: str, article: Optional[str]
) -> ObligationMatch:
"""Use local LLM to extract the obligation from the chunk."""
prompt = f"""Analysiere den folgenden Gesetzestext und extrahiere die zentrale rechtliche Pflicht.
Text:
{chunk_text[:3000]}
Quelle: {regulation_code} {article or ''}
Antworte NUR als JSON:
{{
"obligation_text": "Die zentrale Pflicht in einem Satz",
"actor": "Wer muss handeln (z.B. Verantwortlicher, Auftragsverarbeiter)",
"action": "Was muss getan werden",
"normative_strength": "muss|soll|kann"
}}"""
system_prompt = (
"Du bist ein Rechtsexperte fuer EU-Datenschutz- und Digitalrecht. "
"Extrahiere die zentrale rechtliche Pflicht aus Gesetzestexten. "
"Antworte ausschliesslich als JSON."
)
result_text = await _llm_ollama(prompt, system_prompt)
if not result_text:
return ObligationMatch(
method="llm_extracted",
confidence=0.0,
regulation_id=_normalize_regulation(regulation_code),
)
parsed = _parse_json(result_text)
obligation_text = parsed.get("obligation_text", result_text[:500])
return ObligationMatch(
obligation_id=None,
obligation_title=None,
obligation_text=obligation_text,
method="llm_extracted",
confidence=0.60,
regulation_id=_normalize_regulation(regulation_code),
)
# -----------------------------------------------------------------------
# Initialization helpers
# -----------------------------------------------------------------------
def _load_obligations(self) -> None:
"""Load all obligation files from v2 framework."""
v2_dir = _find_obligations_dir()
if not v2_dir:
logger.warning("Obligations v2 directory not found — Tier 1 disabled")
return
manifest_path = v2_dir / "_manifest.json"
if not manifest_path.exists():
logger.warning("Manifest not found at %s", manifest_path)
return
with open(manifest_path) as f:
manifest = json.load(f)
for reg_info in manifest.get("regulations", []):
reg_id = reg_info["id"]
reg_file = v2_dir / reg_info["file"]
if not reg_file.exists():
logger.warning("Regulation file not found: %s", reg_file)
continue
with open(reg_file) as f:
data = json.load(f)
for obl in data.get("obligations", []):
obl_id = obl["id"]
entry = _ObligationEntry(
id=obl_id,
title=obl.get("title", ""),
description=obl.get("description", ""),
regulation_id=reg_id,
)
# Build article lookup from legal_basis
for basis in obl.get("legal_basis", []):
article_raw = basis.get("article", "")
if article_raw:
norm_art = _normalize_article(article_raw)
key = f"{reg_id}/{norm_art}"
if key not in self._article_lookup:
self._article_lookup[key] = []
self._article_lookup[key].append(obl_id)
entry.articles.append(norm_art)
self._obligations[obl_id] = entry
logger.info(
"Loaded %d obligations from %d regulations",
len(self._obligations),
len(manifest.get("regulations", [])),
)
async def _compute_embeddings(self) -> None:
"""Compute embeddings for all obligation descriptions."""
if not self._obligations:
return
self._obligation_ids = list(self._obligations.keys())
texts = [
f"{self._obligations[oid].title}: {self._obligations[oid].description}"
for oid in self._obligation_ids
]
logger.info("Computing embeddings for %d obligations...", len(texts))
self._obligation_embeddings = await _get_embeddings_batch(texts)
valid = sum(1 for e in self._obligation_embeddings if e)
logger.info("Got %d/%d valid embeddings", valid, len(texts))
# -----------------------------------------------------------------------
# Stats
# -----------------------------------------------------------------------
def stats(self) -> dict:
"""Return initialization statistics."""
return {
"total_obligations": len(self._obligations),
"article_lookups": len(self._article_lookup),
"embeddings_valid": sum(1 for e in self._obligation_embeddings if e),
"regulations": list(
{e.regulation_id for e in self._obligations.values()}
),
"initialized": self._initialized,
}
# ---------------------------------------------------------------------------
# Module-level helpers (reusable by other modules)
# ---------------------------------------------------------------------------
def _normalize_regulation(regulation_code: str) -> Optional[str]:
"""Map a RAG regulation_code to obligation framework regulation ID."""
if not regulation_code:
return None
code = regulation_code.lower().strip()
# Direct lookup
if code in _REGULATION_CODE_TO_ID:
return _REGULATION_CODE_TO_ID[code]
# Prefix matching for families
for prefix, reg_id in [
("eu_2016_679", "dsgvo"),
("eu_2024_1689", "ai_act"),
("eu_2022_2555", "nis2"),
("eu_2022_2065", "dsa"),
("eu_2023_2854", "data_act"),
("eu_2023_1230", "eu_machinery"),
("eu_2022_2554", "dora"),
]:
if code.startswith(prefix):
return reg_id
return None
def _normalize_article(article: str) -> str:
"""Normalize article references for consistent lookup.
Examples:
"Art. 30""art. 30"
"§ 38 BDSG""§ 38"
"Article 10""art. 10"
"Art. 30 Abs. 1""art. 30"
"Artikel 35""art. 35"
"""
if not article:
return ""
s = article.strip()
# Remove trailing law name: "§ 38 BDSG" → "§ 38"
s = re.sub(r"\s+(DSGVO|BDSG|TTDSG|DSA|NIS2|DORA|AI.?Act)\s*$", "", s, flags=re.IGNORECASE)
# Remove paragraph references: "Art. 30 Abs. 1" → "Art. 30"
s = re.sub(r"\s+(Abs|Absatz|para|paragraph|lit|Satz)\.?\s+.*$", "", s, flags=re.IGNORECASE)
# Normalize "Article" / "Artikel" → "Art."
s = re.sub(r"^(Article|Artikel)\s+", "Art. ", s, flags=re.IGNORECASE)
return s.lower().strip()
def _cosine_sim(a: list[float], b: list[float]) -> float:
"""Compute cosine similarity between two vectors."""
if not a or not b or len(a) != len(b):
return 0.0
dot = sum(x * y for x, y in zip(a, b))
norm_a = sum(x * x for x in a) ** 0.5
norm_b = sum(x * x for x in b) ** 0.5
if norm_a == 0 or norm_b == 0:
return 0.0
return dot / (norm_a * norm_b)
def _find_obligations_dir() -> Optional[Path]:
"""Locate the obligations v2 directory."""
candidates = [
Path(__file__).resolve().parent.parent.parent.parent
/ "ai-compliance-sdk" / "policies" / "obligations" / "v2",
Path("/app/ai-compliance-sdk/policies/obligations/v2"),
Path("ai-compliance-sdk/policies/obligations/v2"),
]
for p in candidates:
if p.is_dir() and (p / "_manifest.json").exists():
return p
return None
async def _get_embedding(text: str) -> list[float]:
"""Get embedding vector for a single text."""
try:
async with httpx.AsyncClient(timeout=10.0) as client:
resp = await client.post(
f"{EMBEDDING_URL}/embed",
json={"texts": [text]},
)
resp.raise_for_status()
embeddings = resp.json().get("embeddings", [])
return embeddings[0] if embeddings else []
except Exception:
return []
async def _get_embeddings_batch(
texts: list[str], batch_size: int = 32
) -> list[list[float]]:
"""Get embeddings for multiple texts in batches."""
all_embeddings: list[list[float]] = []
for i in range(0, len(texts), batch_size):
batch = texts[i : i + batch_size]
try:
async with httpx.AsyncClient(timeout=30.0) as client:
resp = await client.post(
f"{EMBEDDING_URL}/embed",
json={"texts": batch},
)
resp.raise_for_status()
embeddings = resp.json().get("embeddings", [])
all_embeddings.extend(embeddings)
except Exception as e:
logger.warning("Batch embedding failed for %d texts: %s", len(batch), e)
all_embeddings.extend([[] for _ in batch])
return all_embeddings
async def _llm_ollama(prompt: str, system_prompt: Optional[str] = None) -> str:
"""Call local Ollama for LLM extraction."""
messages = []
if system_prompt:
messages.append({"role": "system", "content": system_prompt})
messages.append({"role": "user", "content": prompt})
payload = {
"model": OLLAMA_MODEL,
"messages": messages,
"stream": False,
"format": "json",
"options": {"num_predict": 512},
"think": False,
}
try:
async with httpx.AsyncClient(timeout=LLM_TIMEOUT) as client:
resp = await client.post(f"{OLLAMA_URL}/api/chat", json=payload)
if resp.status_code != 200:
logger.error(
"Ollama chat failed %d: %s", resp.status_code, resp.text[:300]
)
return ""
data = resp.json()
return data.get("message", {}).get("content", "")
except Exception as e:
logger.warning("Ollama call failed: %s", e)
return ""
def _parse_json(text: str) -> dict:
"""Extract JSON from LLM response text."""
# Try direct parse
try:
return json.loads(text)
except json.JSONDecodeError:
pass
# Try extracting JSON block
match = re.search(r"\{[^{}]*\}", text, re.DOTALL)
if match:
try:
return json.loads(match.group())
except json.JSONDecodeError:
pass
return {}

View File

@@ -0,0 +1,532 @@
"""Pattern Matcher — Obligation-to-Control-Pattern Linking.
Maps obligations (from the ObligationExtractor) to control patterns
using two tiers:
Tier 1: KEYWORD MATCH — obligation_match_keywords from patterns (~70%)
Tier 2: EMBEDDING — cosine similarity with domain bonus (~25%)
Part of the Multi-Layer Control Architecture (Phase 5 of 8).
"""
import logging
import os
from dataclasses import dataclass, field
from pathlib import Path
from typing import Optional
import yaml
from compliance.services.obligation_extractor import (
_cosine_sim,
_get_embedding,
_get_embeddings_batch,
)
logger = logging.getLogger(__name__)
# Minimum keyword score to accept a match (at least 2 keyword hits)
KEYWORD_MATCH_MIN_HITS = 2
# Embedding threshold for Tier 2
EMBEDDING_PATTERN_THRESHOLD = 0.75
# Domain bonus when regulation maps to the pattern's domain
DOMAIN_BONUS = 0.10
# Map regulation IDs to pattern domains that are likely relevant
_REGULATION_DOMAIN_AFFINITY = {
"dsgvo": ["DATA", "COMP", "GOV"],
"bdsg": ["DATA", "COMP"],
"ttdsg": ["DATA"],
"ai_act": ["AI", "COMP", "DATA"],
"nis2": ["SEC", "INC", "NET", "LOG", "CRYP"],
"dsa": ["DATA", "COMP"],
"data_act": ["DATA", "COMP"],
"eu_machinery": ["SEC", "COMP"],
"dora": ["SEC", "INC", "FIN", "COMP"],
}
@dataclass
class ControlPattern:
"""Python representation of a control pattern from YAML."""
id: str
name: str
name_de: str
domain: str
category: str
description: str
objective_template: str
rationale_template: str
requirements_template: list[str] = field(default_factory=list)
test_procedure_template: list[str] = field(default_factory=list)
evidence_template: list[str] = field(default_factory=list)
severity_default: str = "medium"
implementation_effort_default: str = "m"
obligation_match_keywords: list[str] = field(default_factory=list)
tags: list[str] = field(default_factory=list)
composable_with: list[str] = field(default_factory=list)
open_anchor_refs: list[dict] = field(default_factory=list)
@dataclass
class PatternMatchResult:
"""Result of pattern matching."""
pattern: Optional[ControlPattern] = None
pattern_id: Optional[str] = None
method: str = "none" # keyword | embedding | combined | none
confidence: float = 0.0
keyword_hits: int = 0
total_keywords: int = 0
embedding_score: float = 0.0
domain_bonus_applied: bool = False
composable_patterns: list[str] = field(default_factory=list)
def to_dict(self) -> dict:
return {
"pattern_id": self.pattern_id,
"method": self.method,
"confidence": round(self.confidence, 3),
"keyword_hits": self.keyword_hits,
"total_keywords": self.total_keywords,
"embedding_score": round(self.embedding_score, 3),
"domain_bonus_applied": self.domain_bonus_applied,
"composable_patterns": self.composable_patterns,
}
class PatternMatcher:
"""Links obligations to control patterns using keyword + embedding matching.
Usage::
matcher = PatternMatcher()
await matcher.initialize()
result = await matcher.match(
obligation_text="Fuehrung eines Verarbeitungsverzeichnisses...",
regulation_id="dsgvo",
)
print(result.pattern_id) # e.g. "CP-COMP-001"
print(result.confidence) # e.g. 0.85
"""
def __init__(self):
self._patterns: list[ControlPattern] = []
self._by_id: dict[str, ControlPattern] = {}
self._by_domain: dict[str, list[ControlPattern]] = {}
self._keyword_index: dict[str, list[str]] = {} # keyword → [pattern_ids]
self._pattern_embeddings: list[list[float]] = []
self._pattern_ids: list[str] = []
self._initialized = False
async def initialize(self) -> None:
"""Load patterns from YAML and compute embeddings."""
if self._initialized:
return
self._load_patterns()
self._build_keyword_index()
await self._compute_embeddings()
self._initialized = True
logger.info(
"PatternMatcher initialized: %d patterns, %d keywords, %d embeddings",
len(self._patterns),
len(self._keyword_index),
sum(1 for e in self._pattern_embeddings if e),
)
async def match(
self,
obligation_text: str,
regulation_id: Optional[str] = None,
top_n: int = 1,
) -> PatternMatchResult:
"""Match obligation text to the best control pattern.
Args:
obligation_text: The obligation description to match against.
regulation_id: Source regulation (for domain bonus).
top_n: Number of top results to consider for composability.
Returns:
PatternMatchResult with the best match.
"""
if not self._initialized:
await self.initialize()
if not obligation_text or not self._patterns:
return PatternMatchResult()
# Tier 1: Keyword matching
keyword_result = self._tier1_keyword(obligation_text, regulation_id)
# Tier 2: Embedding matching
embedding_result = await self._tier2_embedding(obligation_text, regulation_id)
# Combine scores: prefer keyword match, boost with embedding if available
best = self._combine_results(keyword_result, embedding_result)
# Attach composable patterns
if best.pattern:
best.composable_patterns = [
pid for pid in best.pattern.composable_with
if pid in self._by_id
]
return best
async def match_top_n(
self,
obligation_text: str,
regulation_id: Optional[str] = None,
n: int = 3,
) -> list[PatternMatchResult]:
"""Return top-N pattern matches sorted by confidence descending."""
if not self._initialized:
await self.initialize()
if not obligation_text or not self._patterns:
return []
keyword_scores = self._keyword_scores(obligation_text, regulation_id)
embedding_scores = await self._embedding_scores(obligation_text, regulation_id)
# Merge scores
all_pattern_ids = set(keyword_scores.keys()) | set(embedding_scores.keys())
results: list[PatternMatchResult] = []
for pid in all_pattern_ids:
pattern = self._by_id.get(pid)
if not pattern:
continue
kw_score = keyword_scores.get(pid, (0, 0, 0.0)) # (hits, total, score)
emb_score = embedding_scores.get(pid, (0.0, False)) # (score, bonus_applied)
kw_hits, kw_total, kw_confidence = kw_score
emb_confidence, bonus_applied = emb_score
# Combined confidence: max of keyword and embedding, with boost if both
if kw_confidence > 0 and emb_confidence > 0:
combined = max(kw_confidence, emb_confidence) + 0.05
method = "combined"
elif kw_confidence > 0:
combined = kw_confidence
method = "keyword"
else:
combined = emb_confidence
method = "embedding"
results.append(PatternMatchResult(
pattern=pattern,
pattern_id=pid,
method=method,
confidence=min(combined, 1.0),
keyword_hits=kw_hits,
total_keywords=kw_total,
embedding_score=emb_confidence,
domain_bonus_applied=bonus_applied,
composable_patterns=[
p for p in pattern.composable_with if p in self._by_id
],
))
# Sort by confidence descending
results.sort(key=lambda r: r.confidence, reverse=True)
return results[:n]
# -----------------------------------------------------------------------
# Tier 1: Keyword Match
# -----------------------------------------------------------------------
def _tier1_keyword(
self, obligation_text: str, regulation_id: Optional[str]
) -> Optional[PatternMatchResult]:
"""Match by counting keyword hits in the obligation text."""
scores = self._keyword_scores(obligation_text, regulation_id)
if not scores:
return None
# Find best match
best_pid = max(scores, key=lambda pid: scores[pid][2])
hits, total, confidence = scores[best_pid]
if hits < KEYWORD_MATCH_MIN_HITS:
return None
pattern = self._by_id.get(best_pid)
if not pattern:
return None
# Check domain bonus
bonus_applied = False
if regulation_id and self._domain_matches(pattern.domain, regulation_id):
confidence = min(confidence + DOMAIN_BONUS, 1.0)
bonus_applied = True
return PatternMatchResult(
pattern=pattern,
pattern_id=best_pid,
method="keyword",
confidence=confidence,
keyword_hits=hits,
total_keywords=total,
domain_bonus_applied=bonus_applied,
)
def _keyword_scores(
self, text: str, regulation_id: Optional[str]
) -> dict[str, tuple[int, int, float]]:
"""Compute keyword match scores for all patterns.
Returns dict: pattern_id → (hits, total_keywords, confidence).
"""
text_lower = text.lower()
hits_by_pattern: dict[str, int] = {}
for keyword, pattern_ids in self._keyword_index.items():
if keyword in text_lower:
for pid in pattern_ids:
hits_by_pattern[pid] = hits_by_pattern.get(pid, 0) + 1
result: dict[str, tuple[int, int, float]] = {}
for pid, hits in hits_by_pattern.items():
pattern = self._by_id.get(pid)
if not pattern:
continue
total = len(pattern.obligation_match_keywords)
confidence = hits / total if total > 0 else 0.0
result[pid] = (hits, total, confidence)
return result
# -----------------------------------------------------------------------
# Tier 2: Embedding Match
# -----------------------------------------------------------------------
async def _tier2_embedding(
self, obligation_text: str, regulation_id: Optional[str]
) -> Optional[PatternMatchResult]:
"""Match by embedding similarity against pattern objective_templates."""
scores = await self._embedding_scores(obligation_text, regulation_id)
if not scores:
return None
best_pid = max(scores, key=lambda pid: scores[pid][0])
emb_score, bonus_applied = scores[best_pid]
if emb_score < EMBEDDING_PATTERN_THRESHOLD:
return None
pattern = self._by_id.get(best_pid)
if not pattern:
return None
return PatternMatchResult(
pattern=pattern,
pattern_id=best_pid,
method="embedding",
confidence=min(emb_score, 1.0),
embedding_score=emb_score,
domain_bonus_applied=bonus_applied,
)
async def _embedding_scores(
self, obligation_text: str, regulation_id: Optional[str]
) -> dict[str, tuple[float, bool]]:
"""Compute embedding similarity scores for all patterns.
Returns dict: pattern_id → (score, domain_bonus_applied).
"""
if not self._pattern_embeddings:
return {}
chunk_embedding = await _get_embedding(obligation_text[:2000])
if not chunk_embedding:
return {}
result: dict[str, tuple[float, bool]] = {}
for i, pat_emb in enumerate(self._pattern_embeddings):
if not pat_emb:
continue
pid = self._pattern_ids[i]
pattern = self._by_id.get(pid)
if not pattern:
continue
score = _cosine_sim(chunk_embedding, pat_emb)
# Domain bonus
bonus_applied = False
if regulation_id and self._domain_matches(pattern.domain, regulation_id):
score += DOMAIN_BONUS
bonus_applied = True
result[pid] = (score, bonus_applied)
return result
# -----------------------------------------------------------------------
# Score combination
# -----------------------------------------------------------------------
def _combine_results(
self,
keyword_result: Optional[PatternMatchResult],
embedding_result: Optional[PatternMatchResult],
) -> PatternMatchResult:
"""Combine keyword and embedding results into the best match."""
if not keyword_result and not embedding_result:
return PatternMatchResult()
if not keyword_result:
return embedding_result
if not embedding_result:
return keyword_result
# Both matched — check if they agree
if keyword_result.pattern_id == embedding_result.pattern_id:
# Same pattern: boost confidence
combined_confidence = min(
max(keyword_result.confidence, embedding_result.confidence) + 0.05,
1.0,
)
return PatternMatchResult(
pattern=keyword_result.pattern,
pattern_id=keyword_result.pattern_id,
method="combined",
confidence=combined_confidence,
keyword_hits=keyword_result.keyword_hits,
total_keywords=keyword_result.total_keywords,
embedding_score=embedding_result.embedding_score,
domain_bonus_applied=(
keyword_result.domain_bonus_applied
or embedding_result.domain_bonus_applied
),
)
# Different patterns: pick the one with higher confidence
if keyword_result.confidence >= embedding_result.confidence:
return keyword_result
return embedding_result
# -----------------------------------------------------------------------
# Domain affinity
# -----------------------------------------------------------------------
@staticmethod
def _domain_matches(pattern_domain: str, regulation_id: str) -> bool:
"""Check if a pattern's domain has affinity with a regulation."""
affine_domains = _REGULATION_DOMAIN_AFFINITY.get(regulation_id, [])
return pattern_domain in affine_domains
# -----------------------------------------------------------------------
# Initialization helpers
# -----------------------------------------------------------------------
def _load_patterns(self) -> None:
"""Load control patterns from YAML files."""
patterns_dir = _find_patterns_dir()
if not patterns_dir:
logger.warning("Control patterns directory not found")
return
for yaml_file in sorted(patterns_dir.glob("*.yaml")):
if yaml_file.name.startswith("_"):
continue
try:
with open(yaml_file) as f:
data = yaml.safe_load(f)
if not data or "patterns" not in data:
continue
for p in data["patterns"]:
pattern = ControlPattern(
id=p["id"],
name=p["name"],
name_de=p["name_de"],
domain=p["domain"],
category=p["category"],
description=p["description"],
objective_template=p["objective_template"],
rationale_template=p["rationale_template"],
requirements_template=p.get("requirements_template", []),
test_procedure_template=p.get("test_procedure_template", []),
evidence_template=p.get("evidence_template", []),
severity_default=p.get("severity_default", "medium"),
implementation_effort_default=p.get("implementation_effort_default", "m"),
obligation_match_keywords=p.get("obligation_match_keywords", []),
tags=p.get("tags", []),
composable_with=p.get("composable_with", []),
open_anchor_refs=p.get("open_anchor_refs", []),
)
self._patterns.append(pattern)
self._by_id[pattern.id] = pattern
domain_list = self._by_domain.setdefault(pattern.domain, [])
domain_list.append(pattern)
except Exception as e:
logger.error("Failed to load %s: %s", yaml_file.name, e)
logger.info("Loaded %d patterns from %s", len(self._patterns), patterns_dir)
def _build_keyword_index(self) -> None:
"""Build reverse index: keyword → [pattern_ids]."""
for pattern in self._patterns:
for kw in pattern.obligation_match_keywords:
lower_kw = kw.lower()
if lower_kw not in self._keyword_index:
self._keyword_index[lower_kw] = []
self._keyword_index[lower_kw].append(pattern.id)
async def _compute_embeddings(self) -> None:
"""Compute embeddings for all pattern objective templates."""
if not self._patterns:
return
self._pattern_ids = [p.id for p in self._patterns]
texts = [
f"{p.name_de}: {p.objective_template}"
for p in self._patterns
]
logger.info("Computing embeddings for %d patterns...", len(texts))
self._pattern_embeddings = await _get_embeddings_batch(texts)
valid = sum(1 for e in self._pattern_embeddings if e)
logger.info("Got %d/%d valid pattern embeddings", valid, len(texts))
# -----------------------------------------------------------------------
# Public helpers
# -----------------------------------------------------------------------
def get_pattern(self, pattern_id: str) -> Optional[ControlPattern]:
"""Get a pattern by its ID."""
return self._by_id.get(pattern_id.upper())
def get_patterns_by_domain(self, domain: str) -> list[ControlPattern]:
"""Get all patterns for a domain."""
return self._by_domain.get(domain.upper(), [])
def stats(self) -> dict:
"""Return matcher statistics."""
return {
"total_patterns": len(self._patterns),
"domains": list(self._by_domain.keys()),
"keywords": len(self._keyword_index),
"embeddings_valid": sum(1 for e in self._pattern_embeddings if e),
"initialized": self._initialized,
}
def _find_patterns_dir() -> Optional[Path]:
"""Locate the control_patterns directory."""
candidates = [
Path(__file__).resolve().parent.parent.parent.parent
/ "ai-compliance-sdk" / "policies" / "control_patterns",
Path("/app/ai-compliance-sdk/policies/control_patterns"),
Path("ai-compliance-sdk/policies/control_patterns"),
]
for p in candidates:
if p.is_dir():
return p
return None

View File

@@ -0,0 +1,670 @@
"""Pipeline Adapter — New 10-Stage Pipeline Integration.
Bridges the existing 7-stage control_generator pipeline with the new
multi-layer components (ObligationExtractor, PatternMatcher, ControlComposer).
New pipeline flow:
chunk → license_classify
→ obligation_extract (Stage 4 — NEW)
→ pattern_match (Stage 5 — NEW)
→ control_compose (Stage 6 — replaces old Stage 3)
→ harmonize → anchor → store + crosswalk → mark processed
Can be used in two modes:
1. INLINE: Called from _process_batch() to enrich the pipeline
2. STANDALONE: Process chunks directly through new stages
Part of the Multi-Layer Control Architecture (Phase 7 of 8).
"""
import hashlib
import json
import logging
from dataclasses import dataclass, field
from typing import Optional
from sqlalchemy import text
from sqlalchemy.orm import Session
from compliance.services.control_composer import ComposedControl, ControlComposer
from compliance.services.obligation_extractor import ObligationExtractor, ObligationMatch
from compliance.services.pattern_matcher import PatternMatcher, PatternMatchResult
logger = logging.getLogger(__name__)
@dataclass
class PipelineChunk:
"""Input chunk for the new pipeline stages."""
text: str
collection: str = ""
regulation_code: str = ""
article: Optional[str] = None
paragraph: Optional[str] = None
license_rule: int = 3
license_info: dict = field(default_factory=dict)
source_citation: Optional[dict] = None
chunk_hash: str = ""
def compute_hash(self) -> str:
if not self.chunk_hash:
self.chunk_hash = hashlib.sha256(self.text.encode()).hexdigest()
return self.chunk_hash
@dataclass
class PipelineResult:
"""Result of processing a chunk through the new pipeline."""
chunk: PipelineChunk
obligation: ObligationMatch = field(default_factory=ObligationMatch)
pattern_result: PatternMatchResult = field(default_factory=PatternMatchResult)
control: Optional[ComposedControl] = None
crosswalk_written: bool = False
error: Optional[str] = None
def to_dict(self) -> dict:
return {
"chunk_hash": self.chunk.chunk_hash,
"obligation": self.obligation.to_dict() if self.obligation else None,
"pattern": self.pattern_result.to_dict() if self.pattern_result else None,
"control": self.control.to_dict() if self.control else None,
"crosswalk_written": self.crosswalk_written,
"error": self.error,
}
class PipelineAdapter:
"""Integrates ObligationExtractor + PatternMatcher + ControlComposer.
Usage::
adapter = PipelineAdapter(db)
await adapter.initialize()
result = await adapter.process_chunk(PipelineChunk(
text="...",
regulation_code="eu_2016_679",
article="Art. 30",
license_rule=1,
))
"""
def __init__(self, db: Optional[Session] = None):
self.db = db
self._extractor = ObligationExtractor()
self._matcher = PatternMatcher()
self._composer = ControlComposer()
self._initialized = False
async def initialize(self) -> None:
"""Initialize all sub-components."""
if self._initialized:
return
await self._extractor.initialize()
await self._matcher.initialize()
self._initialized = True
logger.info("PipelineAdapter initialized")
async def process_chunk(self, chunk: PipelineChunk) -> PipelineResult:
"""Process a single chunk through the new 3-stage pipeline.
Stage 4: Obligation Extract
Stage 5: Pattern Match
Stage 6: Control Compose
"""
if not self._initialized:
await self.initialize()
chunk.compute_hash()
result = PipelineResult(chunk=chunk)
try:
# Stage 4: Obligation Extract
result.obligation = await self._extractor.extract(
chunk_text=chunk.text,
regulation_code=chunk.regulation_code,
article=chunk.article,
paragraph=chunk.paragraph,
)
# Stage 5: Pattern Match
obligation_text = (
result.obligation.obligation_text
or result.obligation.obligation_title
or chunk.text[:500]
)
result.pattern_result = await self._matcher.match(
obligation_text=obligation_text,
regulation_id=result.obligation.regulation_id,
)
# Stage 6: Control Compose
result.control = await self._composer.compose(
obligation=result.obligation,
pattern_result=result.pattern_result,
chunk_text=chunk.text if chunk.license_rule in (1, 2) else None,
license_rule=chunk.license_rule,
source_citation=chunk.source_citation,
regulation_code=chunk.regulation_code,
)
except Exception as e:
logger.error("Pipeline processing failed: %s", e)
result.error = str(e)
return result
async def process_batch(self, chunks: list[PipelineChunk]) -> list[PipelineResult]:
"""Process multiple chunks through the pipeline."""
results = []
for chunk in chunks:
result = await self.process_chunk(chunk)
results.append(result)
return results
def write_crosswalk(self, result: PipelineResult, control_uuid: str) -> bool:
"""Write obligation_extraction + crosswalk_matrix rows for a processed chunk.
Called AFTER the control is stored in canonical_controls.
"""
if not self.db or not result.control:
return False
chunk = result.chunk
obligation = result.obligation
pattern = result.pattern_result
try:
# 1. Write obligation_extraction row
self.db.execute(
text("""
INSERT INTO obligation_extractions (
chunk_hash, collection, regulation_code,
article, paragraph, obligation_id,
obligation_text, confidence, extraction_method,
pattern_id, pattern_match_score, control_uuid
) VALUES (
:chunk_hash, :collection, :regulation_code,
:article, :paragraph, :obligation_id,
:obligation_text, :confidence, :extraction_method,
:pattern_id, :pattern_match_score,
CAST(:control_uuid AS uuid)
)
"""),
{
"chunk_hash": chunk.chunk_hash,
"collection": chunk.collection,
"regulation_code": chunk.regulation_code,
"article": chunk.article,
"paragraph": chunk.paragraph,
"obligation_id": obligation.obligation_id if obligation else None,
"obligation_text": (
obligation.obligation_text[:2000]
if obligation and obligation.obligation_text
else None
),
"confidence": obligation.confidence if obligation else 0,
"extraction_method": obligation.method if obligation else "none",
"pattern_id": pattern.pattern_id if pattern else None,
"pattern_match_score": pattern.confidence if pattern else 0,
"control_uuid": control_uuid,
},
)
# 2. Write crosswalk_matrix row
self.db.execute(
text("""
INSERT INTO crosswalk_matrix (
regulation_code, article, paragraph,
obligation_id, pattern_id,
master_control_id, master_control_uuid,
confidence, source
) VALUES (
:regulation_code, :article, :paragraph,
:obligation_id, :pattern_id,
:master_control_id,
CAST(:master_control_uuid AS uuid),
:confidence, :source
)
"""),
{
"regulation_code": chunk.regulation_code,
"article": chunk.article,
"paragraph": chunk.paragraph,
"obligation_id": obligation.obligation_id if obligation else None,
"pattern_id": pattern.pattern_id if pattern else None,
"master_control_id": result.control.control_id,
"master_control_uuid": control_uuid,
"confidence": min(
obligation.confidence if obligation else 0,
pattern.confidence if pattern else 0,
),
"source": "auto",
},
)
# 3. Update canonical_controls with pattern_id + obligation_ids
if result.control.pattern_id or result.control.obligation_ids:
self.db.execute(
text("""
UPDATE canonical_controls
SET pattern_id = COALESCE(:pattern_id, pattern_id),
obligation_ids = COALESCE(:obligation_ids, obligation_ids)
WHERE id = CAST(:control_uuid AS uuid)
"""),
{
"pattern_id": result.control.pattern_id,
"obligation_ids": json.dumps(result.control.obligation_ids),
"control_uuid": control_uuid,
},
)
self.db.commit()
result.crosswalk_written = True
return True
except Exception as e:
logger.error("Failed to write crosswalk: %s", e)
self.db.rollback()
return False
def stats(self) -> dict:
"""Return component statistics."""
return {
"extractor": self._extractor.stats(),
"matcher": self._matcher.stats(),
"initialized": self._initialized,
}
# ---------------------------------------------------------------------------
# Migration Passes — Backfill existing 4,800+ controls
# ---------------------------------------------------------------------------
class MigrationPasses:
"""Non-destructive migration passes for existing controls.
Pass 1: Obligation Linkage (deterministic, article→obligation lookup)
Pass 2: Pattern Classification (keyword-based matching)
Pass 3: Quality Triage (categorize by linkage completeness)
Pass 4: Crosswalk Backfill (write crosswalk rows for linked controls)
Pass 5: Deduplication (mark duplicate controls)
Usage::
migration = MigrationPasses(db)
await migration.initialize()
result = await migration.run_pass1_obligation_linkage(limit=100)
result = await migration.run_pass2_pattern_classification(limit=100)
result = migration.run_pass3_quality_triage()
result = migration.run_pass4_crosswalk_backfill()
result = migration.run_pass5_deduplication()
"""
def __init__(self, db: Session):
self.db = db
self._extractor = ObligationExtractor()
self._matcher = PatternMatcher()
self._initialized = False
async def initialize(self) -> None:
"""Initialize extractors (loads obligations + patterns)."""
if self._initialized:
return
self._extractor._load_obligations()
self._matcher._load_patterns()
self._matcher._build_keyword_index()
self._initialized = True
# -------------------------------------------------------------------
# Pass 1: Obligation Linkage (deterministic)
# -------------------------------------------------------------------
async def run_pass1_obligation_linkage(self, limit: int = 0) -> dict:
"""Link existing controls to obligations via source_citation article.
For each control with source_citation → extract regulation + article
→ look up in obligation framework → set obligation_ids.
"""
if not self._initialized:
await self.initialize()
query = """
SELECT id, control_id, source_citation, generation_metadata
FROM canonical_controls
WHERE release_state NOT IN ('deprecated')
AND (obligation_ids IS NULL OR obligation_ids = '[]')
"""
if limit > 0:
query += f" LIMIT {limit}"
rows = self.db.execute(text(query)).fetchall()
stats = {"total": len(rows), "linked": 0, "no_match": 0, "no_citation": 0}
for row in rows:
control_uuid = str(row[0])
control_id = row[1]
citation = row[2]
metadata = row[3]
# Extract regulation + article from citation or metadata
reg_code, article = _extract_regulation_article(citation, metadata)
if not reg_code:
stats["no_citation"] += 1
continue
# Tier 1: Exact match
match = self._extractor._tier1_exact(reg_code, article or "")
if match and match.obligation_id:
self.db.execute(
text("""
UPDATE canonical_controls
SET obligation_ids = :obl_ids
WHERE id = CAST(:uuid AS uuid)
"""),
{
"obl_ids": json.dumps([match.obligation_id]),
"uuid": control_uuid,
},
)
stats["linked"] += 1
else:
stats["no_match"] += 1
self.db.commit()
logger.info("Pass 1: %s", stats)
return stats
# -------------------------------------------------------------------
# Pass 2: Pattern Classification (keyword-based)
# -------------------------------------------------------------------
async def run_pass2_pattern_classification(self, limit: int = 0) -> dict:
"""Classify existing controls into patterns via keyword matching.
For each control without pattern_id → keyword-match title+objective
against pattern library → assign best match.
"""
if not self._initialized:
await self.initialize()
query = """
SELECT id, control_id, title, objective
FROM canonical_controls
WHERE release_state NOT IN ('deprecated')
AND (pattern_id IS NULL OR pattern_id = '')
"""
if limit > 0:
query += f" LIMIT {limit}"
rows = self.db.execute(text(query)).fetchall()
stats = {"total": len(rows), "classified": 0, "no_match": 0}
for row in rows:
control_uuid = str(row[0])
title = row[2] or ""
objective = row[3] or ""
# Keyword match
match_text = f"{title} {objective}"
result = self._matcher._tier1_keyword(match_text, None)
if result and result.pattern_id and result.keyword_hits >= 2:
self.db.execute(
text("""
UPDATE canonical_controls
SET pattern_id = :pattern_id
WHERE id = CAST(:uuid AS uuid)
"""),
{
"pattern_id": result.pattern_id,
"uuid": control_uuid,
},
)
stats["classified"] += 1
else:
stats["no_match"] += 1
self.db.commit()
logger.info("Pass 2: %s", stats)
return stats
# -------------------------------------------------------------------
# Pass 3: Quality Triage
# -------------------------------------------------------------------
def run_pass3_quality_triage(self) -> dict:
"""Categorize controls by linkage completeness.
Sets generation_metadata.triage_status:
- "review": has both obligation_id + pattern_id
- "needs_obligation": has pattern_id but no obligation_id
- "needs_pattern": has obligation_id but no pattern_id
- "legacy_unlinked": has neither
"""
categories = {
"review": """
UPDATE canonical_controls
SET generation_metadata = jsonb_set(
COALESCE(generation_metadata::jsonb, '{}'::jsonb),
'{triage_status}', '"review"'
)
WHERE release_state NOT IN ('deprecated')
AND obligation_ids IS NOT NULL AND obligation_ids != '[]'
AND pattern_id IS NOT NULL AND pattern_id != ''
""",
"needs_obligation": """
UPDATE canonical_controls
SET generation_metadata = jsonb_set(
COALESCE(generation_metadata::jsonb, '{}'::jsonb),
'{triage_status}', '"needs_obligation"'
)
WHERE release_state NOT IN ('deprecated')
AND (obligation_ids IS NULL OR obligation_ids = '[]')
AND pattern_id IS NOT NULL AND pattern_id != ''
""",
"needs_pattern": """
UPDATE canonical_controls
SET generation_metadata = jsonb_set(
COALESCE(generation_metadata::jsonb, '{}'::jsonb),
'{triage_status}', '"needs_pattern"'
)
WHERE release_state NOT IN ('deprecated')
AND obligation_ids IS NOT NULL AND obligation_ids != '[]'
AND (pattern_id IS NULL OR pattern_id = '')
""",
"legacy_unlinked": """
UPDATE canonical_controls
SET generation_metadata = jsonb_set(
COALESCE(generation_metadata::jsonb, '{}'::jsonb),
'{triage_status}', '"legacy_unlinked"'
)
WHERE release_state NOT IN ('deprecated')
AND (obligation_ids IS NULL OR obligation_ids = '[]')
AND (pattern_id IS NULL OR pattern_id = '')
""",
}
stats = {}
for category, sql in categories.items():
result = self.db.execute(text(sql))
stats[category] = result.rowcount
self.db.commit()
logger.info("Pass 3: %s", stats)
return stats
# -------------------------------------------------------------------
# Pass 4: Crosswalk Backfill
# -------------------------------------------------------------------
def run_pass4_crosswalk_backfill(self) -> dict:
"""Create crosswalk_matrix rows for controls with obligation + pattern.
Only creates rows that don't already exist.
"""
result = self.db.execute(text("""
INSERT INTO crosswalk_matrix (
regulation_code, obligation_id, pattern_id,
master_control_id, master_control_uuid,
confidence, source
)
SELECT
COALESCE(
(generation_metadata::jsonb->>'source_regulation'),
''
) AS regulation_code,
obl.value::text AS obligation_id,
cc.pattern_id,
cc.control_id,
cc.id,
0.80,
'migrated'
FROM canonical_controls cc,
jsonb_array_elements_text(
COALESCE(cc.obligation_ids::jsonb, '[]'::jsonb)
) AS obl(value)
WHERE cc.release_state NOT IN ('deprecated')
AND cc.pattern_id IS NOT NULL AND cc.pattern_id != ''
AND cc.obligation_ids IS NOT NULL AND cc.obligation_ids != '[]'
AND NOT EXISTS (
SELECT 1 FROM crosswalk_matrix cw
WHERE cw.master_control_uuid = cc.id
AND cw.obligation_id = obl.value::text
)
"""))
rows_inserted = result.rowcount
self.db.commit()
logger.info("Pass 4: %d crosswalk rows inserted", rows_inserted)
return {"rows_inserted": rows_inserted}
# -------------------------------------------------------------------
# Pass 5: Deduplication
# -------------------------------------------------------------------
def run_pass5_deduplication(self) -> dict:
"""Mark duplicate controls (same obligation + same pattern).
Groups controls by (obligation_id, pattern_id), keeps the one with
highest evidence_confidence (or newest), marks rest as deprecated.
"""
# Find groups with duplicates
groups = self.db.execute(text("""
SELECT cc.pattern_id,
obl.value::text AS obligation_id,
array_agg(cc.id ORDER BY cc.evidence_confidence DESC NULLS LAST, cc.created_at DESC) AS ids,
count(*) AS cnt
FROM canonical_controls cc,
jsonb_array_elements_text(
COALESCE(cc.obligation_ids::jsonb, '[]'::jsonb)
) AS obl(value)
WHERE cc.release_state NOT IN ('deprecated')
AND cc.pattern_id IS NOT NULL AND cc.pattern_id != ''
GROUP BY cc.pattern_id, obl.value::text
HAVING count(*) > 1
""")).fetchall()
stats = {"groups_found": len(groups), "controls_deprecated": 0}
for group in groups:
ids = group[2] # Array of UUIDs, first is the keeper
if len(ids) <= 1:
continue
# Keep first (highest confidence), deprecate rest
deprecate_ids = ids[1:]
for dep_id in deprecate_ids:
self.db.execute(
text("""
UPDATE canonical_controls
SET release_state = 'deprecated',
generation_metadata = jsonb_set(
COALESCE(generation_metadata::jsonb, '{}'::jsonb),
'{deprecated_reason}', '"duplicate_same_obligation_pattern"'
)
WHERE id = CAST(:uuid AS uuid)
AND release_state != 'deprecated'
"""),
{"uuid": str(dep_id)},
)
stats["controls_deprecated"] += 1
self.db.commit()
logger.info("Pass 5: %s", stats)
return stats
def migration_status(self) -> dict:
"""Return overall migration progress."""
row = self.db.execute(text("""
SELECT
count(*) AS total,
count(*) FILTER (WHERE obligation_ids IS NOT NULL AND obligation_ids != '[]') AS has_obligation,
count(*) FILTER (WHERE pattern_id IS NOT NULL AND pattern_id != '') AS has_pattern,
count(*) FILTER (
WHERE obligation_ids IS NOT NULL AND obligation_ids != '[]'
AND pattern_id IS NOT NULL AND pattern_id != ''
) AS fully_linked,
count(*) FILTER (WHERE release_state = 'deprecated') AS deprecated
FROM canonical_controls
""")).fetchone()
return {
"total_controls": row[0],
"has_obligation": row[1],
"has_pattern": row[2],
"fully_linked": row[3],
"deprecated": row[4],
"coverage_obligation_pct": round(row[1] / max(row[0], 1) * 100, 1),
"coverage_pattern_pct": round(row[2] / max(row[0], 1) * 100, 1),
"coverage_full_pct": round(row[3] / max(row[0], 1) * 100, 1),
}
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _extract_regulation_article(
citation: Optional[str], metadata: Optional[str]
) -> tuple[Optional[str], Optional[str]]:
"""Extract regulation_code and article from control's citation/metadata."""
from compliance.services.obligation_extractor import _normalize_regulation
reg_code = None
article = None
# Try citation first (JSON string or dict)
if citation:
try:
c = json.loads(citation) if isinstance(citation, str) else citation
if isinstance(c, dict):
article = c.get("article") or c.get("source_article")
# Try to get regulation from source field
source = c.get("source", "")
if source:
reg_code = _normalize_regulation(source)
except (json.JSONDecodeError, TypeError):
pass
# Try metadata
if metadata and not reg_code:
try:
m = json.loads(metadata) if isinstance(metadata, str) else metadata
if isinstance(m, dict):
src_reg = m.get("source_regulation", "")
if src_reg:
reg_code = _normalize_regulation(src_reg)
if not article:
article = m.get("source_article")
except (json.JSONDecodeError, TypeError):
pass
return reg_code, article

View File

@@ -33,6 +33,7 @@ class RAGSearchResult:
paragraph: str
source_url: str
score: float
collection: str = ""
class ComplianceRAGClient:
@@ -91,6 +92,7 @@ class ComplianceRAGClient:
paragraph=r.get("paragraph", ""),
source_url=r.get("source_url", ""),
score=r.get("score", 0.0),
collection=collection,
))
return results
@@ -98,6 +100,88 @@ class ComplianceRAGClient:
logger.warning("RAG search failed: %s", e)
return []
async def search_with_rerank(
self,
query: str,
collection: str = "bp_compliance_ce",
regulations: Optional[List[str]] = None,
top_k: int = 5,
) -> List[RAGSearchResult]:
"""
Search with optional cross-encoder re-ranking.
Fetches top_k*4 results from RAG, then re-ranks with cross-encoder
and returns top_k. Falls back to regular search if reranker is disabled.
"""
from .reranker import get_reranker
reranker = get_reranker()
if reranker is None:
return await self.search(query, collection, regulations, top_k)
# Fetch more candidates for re-ranking
candidates = await self.search(
query, collection, regulations, top_k=max(top_k * 4, 20)
)
if not candidates:
return []
texts = [c.text for c in candidates]
try:
ranked_indices = reranker.rerank(query, texts, top_k=top_k)
return [candidates[i] for i in ranked_indices]
except Exception as e:
logger.warning("Reranking failed, returning unranked: %s", e)
return candidates[:top_k]
async def scroll(
self,
collection: str,
offset: Optional[str] = None,
limit: int = 100,
) -> tuple[List[RAGSearchResult], Optional[str]]:
"""
Scroll through ALL chunks in a collection (paginated).
Returns (chunks, next_offset). next_offset is None when done.
"""
scroll_url = self._search_url.replace("/search", "/scroll")
params = {"collection": collection, "limit": str(limit)}
if offset:
params["offset"] = offset
try:
async with httpx.AsyncClient(timeout=30.0) as client:
resp = await client.get(scroll_url, params=params)
if resp.status_code != 200:
logger.warning(
"RAG scroll returned %d: %s", resp.status_code, resp.text[:200]
)
return [], None
data = resp.json()
results = []
for r in data.get("chunks", []):
results.append(RAGSearchResult(
text=r.get("text", ""),
regulation_code=r.get("regulation_code", ""),
regulation_name=r.get("regulation_name", ""),
regulation_short=r.get("regulation_short", ""),
category=r.get("category", ""),
article=r.get("article", ""),
paragraph=r.get("paragraph", ""),
source_url=r.get("source_url", ""),
score=0.0,
collection=collection,
))
next_offset = data.get("next_offset") or None
return results, next_offset
except Exception as e:
logger.warning("RAG scroll failed: %s", e)
return [], None
def format_for_prompt(
self, results: List[RAGSearchResult], max_results: int = 5
) -> str:

View File

@@ -0,0 +1,85 @@
"""
Cross-Encoder Re-Ranking for RAG Search Results.
Uses BGE Reranker v2 (BAAI/bge-reranker-v2-m3, MIT license) to re-rank
search results from Qdrant for improved retrieval quality.
Lazy-loads the model on first use. Disabled by default (RERANK_ENABLED=false).
"""
import logging
import os
from typing import Optional
logger = logging.getLogger(__name__)
RERANK_ENABLED = os.getenv("RERANK_ENABLED", "false").lower() == "true"
RERANK_MODEL = os.getenv("RERANK_MODEL", "BAAI/bge-reranker-v2-m3")
class Reranker:
"""Cross-encoder reranker using sentence-transformers."""
def __init__(self, model_name: str = RERANK_MODEL):
self._model = None # Lazy init
self._model_name = model_name
def _ensure_model(self) -> None:
"""Load model on first use."""
if self._model is not None:
return
try:
from sentence_transformers import CrossEncoder
logger.info("Loading reranker model: %s", self._model_name)
self._model = CrossEncoder(self._model_name)
logger.info("Reranker model loaded successfully")
except ImportError:
logger.error(
"sentence-transformers not installed. "
"Install with: pip install sentence-transformers"
)
raise
except Exception as e:
logger.error("Failed to load reranker model: %s", e)
raise
def rerank(
self, query: str, texts: list[str], top_k: int = 5
) -> list[int]:
"""
Return indices of top_k texts sorted by relevance (highest first).
Args:
query: The search query.
texts: List of candidate texts to re-rank.
top_k: Number of top results to return.
Returns:
List of indices into the original texts list, sorted by relevance.
"""
if not texts:
return []
self._ensure_model()
pairs = [[query, text] for text in texts]
scores = self._model.predict(pairs)
# Sort by score descending, return indices
ranked = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)
return ranked[:top_k]
# Module-level singleton
_reranker: Optional[Reranker] = None
def get_reranker() -> Optional[Reranker]:
"""Get the shared reranker instance. Returns None if disabled."""
global _reranker
if not RERANK_ENABLED:
return None
if _reranker is None:
_reranker = Reranker()
return _reranker

View File

@@ -0,0 +1,331 @@
"""V1 Control Enrichment Service — Match Eigenentwicklung controls to regulations.
Finds regulatory coverage for v1 controls (generation_strategy='ungrouped',
pipeline_version=1, no source_citation) by embedding similarity search.
Reuses embedding + Qdrant helpers from control_dedup.py.
"""
import logging
from typing import Optional
from sqlalchemy import text
from database import SessionLocal
from compliance.services.control_dedup import (
get_embedding,
qdrant_search_cross_regulation,
)
logger = logging.getLogger(__name__)
# Similarity threshold — lower than dedup (0.85) since we want informational matches
# Typical top scores for v1 controls are 0.70-0.77
V1_MATCH_THRESHOLD = 0.70
V1_MAX_MATCHES = 5
def _is_eigenentwicklung_query() -> str:
"""SQL WHERE clause identifying v1 Eigenentwicklung controls."""
return """
generation_strategy = 'ungrouped'
AND (pipeline_version = '1' OR pipeline_version IS NULL)
AND source_citation IS NULL
AND parent_control_uuid IS NULL
AND release_state NOT IN ('rejected', 'merged', 'deprecated')
"""
async def count_v1_controls() -> int:
"""Count how many v1 Eigenentwicklung controls exist."""
with SessionLocal() as db:
row = db.execute(text(f"""
SELECT COUNT(*) AS cnt
FROM canonical_controls
WHERE {_is_eigenentwicklung_query()}
""")).fetchone()
return row.cnt if row else 0
async def enrich_v1_matches(
dry_run: bool = True,
batch_size: int = 100,
offset: int = 0,
) -> dict:
"""Find regulatory matches for v1 Eigenentwicklung controls.
Args:
dry_run: If True, only count — don't write matches.
batch_size: Number of v1 controls to process per call.
offset: Pagination offset (v1 control index).
Returns:
Stats dict with counts, sample matches, and pagination info.
"""
with SessionLocal() as db:
# 1. Load v1 controls (paginated)
v1_controls = db.execute(text(f"""
SELECT id, control_id, title, objective, category
FROM canonical_controls
WHERE {_is_eigenentwicklung_query()}
ORDER BY control_id
LIMIT :limit OFFSET :offset
"""), {"limit": batch_size, "offset": offset}).fetchall()
# Count total for pagination
total_row = db.execute(text(f"""
SELECT COUNT(*) AS cnt
FROM canonical_controls
WHERE {_is_eigenentwicklung_query()}
""")).fetchone()
total_v1 = total_row.cnt if total_row else 0
if not v1_controls:
return {
"dry_run": dry_run,
"processed": 0,
"total_v1": total_v1,
"message": "Kein weiterer Batch — alle v1 Controls verarbeitet.",
}
if dry_run:
return {
"dry_run": True,
"total_v1": total_v1,
"offset": offset,
"batch_size": batch_size,
"sample_controls": [
{
"control_id": r.control_id,
"title": r.title,
"category": r.category,
}
for r in v1_controls[:20]
],
}
# 2. Process each v1 control
processed = 0
matches_inserted = 0
errors = []
sample_matches = []
for v1 in v1_controls:
try:
# Build search text
search_text = f"{v1.title}{v1.objective}"
# Get embedding
embedding = await get_embedding(search_text)
if not embedding:
errors.append({
"control_id": v1.control_id,
"error": "Embedding fehlgeschlagen",
})
continue
# Search Qdrant (cross-regulation, no pattern filter)
# Collection is atomic_controls_dedup (contains ~51k atomare Controls)
results = await qdrant_search_cross_regulation(
embedding, top_k=20,
collection="atomic_controls_dedup",
)
# For each hit: resolve to a regulatory parent with source_citation.
# Atomic controls in Qdrant usually have parent_control_uuid → parent
# has the source_citation. We deduplicate by parent to avoid
# listing the same regulation multiple times.
rank = 0
seen_parents: set[str] = set()
for hit in results:
score = hit.get("score", 0)
if score < V1_MATCH_THRESHOLD:
continue
payload = hit.get("payload", {})
matched_uuid = payload.get("control_uuid")
if not matched_uuid or matched_uuid == str(v1.id):
continue
# Try the matched control itself first, then its parent
matched_row = db.execute(text("""
SELECT c.id, c.control_id, c.title, c.source_citation,
c.severity, c.category, c.parent_control_uuid
FROM canonical_controls c
WHERE c.id = CAST(:uuid AS uuid)
"""), {"uuid": matched_uuid}).fetchone()
if not matched_row:
continue
# Resolve to regulatory control (one with source_citation)
reg_row = matched_row
if not reg_row.source_citation and reg_row.parent_control_uuid:
# Look up parent — the parent has the source_citation
parent_row = db.execute(text("""
SELECT id, control_id, title, source_citation,
severity, category, parent_control_uuid
FROM canonical_controls
WHERE id = CAST(:uuid AS uuid)
AND source_citation IS NOT NULL
"""), {"uuid": str(reg_row.parent_control_uuid)}).fetchone()
if parent_row:
reg_row = parent_row
if not reg_row.source_citation:
continue
# Deduplicate by parent UUID
parent_key = str(reg_row.id)
if parent_key in seen_parents:
continue
seen_parents.add(parent_key)
rank += 1
if rank > V1_MAX_MATCHES:
break
# Extract source info
source_citation = reg_row.source_citation or {}
matched_source = source_citation.get("source") if isinstance(source_citation, dict) else None
matched_article = source_citation.get("article") if isinstance(source_citation, dict) else None
# Insert match — link to the regulatory parent (not the atomic child)
db.execute(text("""
INSERT INTO v1_control_matches
(v1_control_uuid, matched_control_uuid, similarity_score,
match_rank, matched_source, matched_article, match_method)
VALUES
(CAST(:v1_uuid AS uuid), CAST(:matched_uuid AS uuid), :score,
:rank, :source, :article, 'embedding')
ON CONFLICT (v1_control_uuid, matched_control_uuid) DO UPDATE
SET similarity_score = EXCLUDED.similarity_score,
match_rank = EXCLUDED.match_rank
"""), {
"v1_uuid": str(v1.id),
"matched_uuid": str(reg_row.id),
"score": round(score, 3),
"rank": rank,
"source": matched_source,
"article": matched_article,
})
matches_inserted += 1
# Collect sample
if len(sample_matches) < 20:
sample_matches.append({
"v1_control_id": v1.control_id,
"v1_title": v1.title,
"matched_control_id": reg_row.control_id,
"matched_title": reg_row.title,
"matched_source": matched_source,
"matched_article": matched_article,
"similarity_score": round(score, 3),
"match_rank": rank,
})
processed += 1
except Exception as e:
logger.warning("V1 enrichment error for %s: %s", v1.control_id, e)
errors.append({
"control_id": v1.control_id,
"error": str(e),
})
db.commit()
# Pagination
next_offset = offset + batch_size if len(v1_controls) == batch_size else None
return {
"dry_run": False,
"offset": offset,
"batch_size": batch_size,
"next_offset": next_offset,
"total_v1": total_v1,
"processed": processed,
"matches_inserted": matches_inserted,
"errors": errors[:10],
"sample_matches": sample_matches,
}
async def get_v1_matches(control_uuid: str) -> list[dict]:
"""Get all regulatory matches for a specific v1 control.
Args:
control_uuid: The UUID of the v1 control.
Returns:
List of match dicts with control details.
"""
with SessionLocal() as db:
rows = db.execute(text("""
SELECT
m.similarity_score,
m.match_rank,
m.matched_source,
m.matched_article,
m.match_method,
c.control_id AS matched_control_id,
c.title AS matched_title,
c.objective AS matched_objective,
c.severity AS matched_severity,
c.category AS matched_category,
c.source_citation AS matched_source_citation
FROM v1_control_matches m
JOIN canonical_controls c ON c.id = m.matched_control_uuid
WHERE m.v1_control_uuid = CAST(:uuid AS uuid)
ORDER BY m.match_rank
"""), {"uuid": control_uuid}).fetchall()
return [
{
"matched_control_id": r.matched_control_id,
"matched_title": r.matched_title,
"matched_objective": r.matched_objective,
"matched_severity": r.matched_severity,
"matched_category": r.matched_category,
"matched_source": r.matched_source,
"matched_article": r.matched_article,
"matched_source_citation": r.matched_source_citation,
"similarity_score": float(r.similarity_score),
"match_rank": r.match_rank,
"match_method": r.match_method,
}
for r in rows
]
async def get_v1_enrichment_stats() -> dict:
"""Get overview stats for v1 enrichment."""
with SessionLocal() as db:
total_v1 = db.execute(text(f"""
SELECT COUNT(*) AS cnt FROM canonical_controls
WHERE {_is_eigenentwicklung_query()}
""")).fetchone()
matched_v1 = db.execute(text(f"""
SELECT COUNT(DISTINCT m.v1_control_uuid) AS cnt
FROM v1_control_matches m
JOIN canonical_controls c ON c.id = m.v1_control_uuid
WHERE {_is_eigenentwicklung_query().replace('release_state', 'c.release_state').replace('generation_strategy', 'c.generation_strategy').replace('pipeline_version', 'c.pipeline_version').replace('source_citation', 'c.source_citation').replace('parent_control_uuid', 'c.parent_control_uuid')}
""")).fetchone()
total_matches = db.execute(text("""
SELECT COUNT(*) AS cnt FROM v1_control_matches
""")).fetchone()
avg_score = db.execute(text("""
SELECT AVG(similarity_score) AS avg_score FROM v1_control_matches
""")).fetchone()
return {
"total_v1_controls": total_v1.cnt if total_v1 else 0,
"v1_with_matches": matched_v1.cnt if matched_v1 else 0,
"v1_without_matches": (total_v1.cnt if total_v1 else 0) - (matched_v1.cnt if matched_v1 else 0),
"total_matches": total_matches.cnt if total_matches else 0,
"avg_similarity_score": round(float(avg_score.avg_score), 3) if avg_score and avg_score.avg_score else None,
}