merge: sync with origin/main, take upstream on conflicts

# Conflicts:
#	admin-compliance/lib/sdk/types.ts
#	admin-compliance/lib/sdk/vendor-compliance/types.ts
This commit is contained in:
Sharang Parnerkar
2026-04-16 16:26:48 +02:00
352 changed files with 181673 additions and 2188 deletions
+4 -2
View File
@@ -10,13 +10,15 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
&& rm -rf /var/lib/apt/lists/*
# Copy requirements first for better caching
COPY requirements.txt .
COPY requirements.txt requirements-reranker.txt ./
# Create virtual environment and install dependencies
RUN python -m venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"
RUN pip install --no-cache-dir --upgrade pip && \
pip install --no-cache-dir -r requirements.txt
# Only the optional reranker install may fail without failing the build.
# Grouping is required because `a && b && c || echo` would also mask a failed
# pip upgrade or a failed core requirements install.
pip install --no-cache-dir -r requirements.txt && \
( pip install --no-cache-dir -r requirements-reranker.txt || \
echo "WARNING: reranker dependencies not installed (torch/sentence-transformers)" )
# ---- Runtime stage ----
FROM python:3.12-slim-bookworm
@@ -6,6 +6,8 @@ from .routes import router
logger = logging.getLogger(__name__)
_failed_routers: dict[str, str] = {}
def _safe_import_router(module_name: str, attr: str = "router"):
"""Import a router module safely — log error but don't crash the whole app."""
@@ -14,6 +16,7 @@ def _safe_import_router(module_name: str, attr: str = "router"):
return getattr(mod, attr)
except Exception as e:
logger.error("Failed to import %s: %s", module_name, e)
_failed_routers[module_name] = str(e)
return None
@@ -53,6 +56,13 @@ _ROUTER_MODULES = [
"wiki_routes",
"canonical_control_routes",
"control_generator_routes",
"crosswalk_routes",
"process_task_routes",
"evidence_check_routes",
"vvt_library_routes",
"tom_mapping_routes",
"llm_audit_routes",
"assertion_routes",
]
_loaded_count = 0
@@ -0,0 +1,227 @@
"""
API routes for Assertion Engine (Anti-Fake-Evidence Phase 2).
Endpoints:
- /assertions: CRUD for assertions
- /assertions/extract: Automatic extraction from entity text
- /assertions/summary: Stats (total assertions, facts, unverified)
"""
import logging
from datetime import datetime
from typing import Optional
from fastapi import APIRouter, Depends, HTTPException, Query
from sqlalchemy.orm import Session
from classroom_engine.database import get_db
from ..db.models import AssertionDB
from ..services.assertion_engine import extract_assertions
from .schemas import (
AssertionCreate,
AssertionUpdate,
AssertionResponse,
AssertionListResponse,
AssertionSummaryResponse,
AssertionExtractRequest,
)
from .audit_trail_utils import log_audit_trail, generate_id
logger = logging.getLogger(__name__)
router = APIRouter(tags=["compliance-assertions"])
def _build_assertion_response(a: AssertionDB) -> AssertionResponse:
    """Convert an ``AssertionDB`` row into its ``AssertionResponse`` schema.

    Every column is copied as-is except two that get fallbacks: a falsy
    ``evidence_ids`` becomes ``[]`` and a falsy ``confidence`` becomes ``0.0``.
    """
    copied_fields = (
        "id",
        "tenant_id",
        "entity_type",
        "entity_id",
        "sentence_text",
        "sentence_index",
        "assertion_type",
        "normative_tier",
        "verified_by",
        "verified_at",
        "created_at",
        "updated_at",
    )
    payload = {name: getattr(a, name) for name in copied_fields}
    payload["evidence_ids"] = a.evidence_ids or []
    payload["confidence"] = a.confidence or 0.0
    return AssertionResponse(**payload)
@router.post("/assertions", response_model=AssertionResponse)
async def create_assertion(
    data: AssertionCreate,
    tenant_id: Optional[str] = Query(None),
    db: Session = Depends(get_db),
):
    """Persist one manually supplied assertion and return the stored row.

    A missing ``assertion_type`` defaults to ``"assertion"`` and missing
    ``evidence_ids`` default to an empty list.
    """
    assertion_type = data.assertion_type or "assertion"
    evidence_ids = data.evidence_ids or []

    record = AssertionDB(
        id=generate_id(),
        tenant_id=tenant_id,
        entity_type=data.entity_type,
        entity_id=data.entity_id,
        sentence_text=data.sentence_text,
        assertion_type=assertion_type,
        evidence_ids=evidence_ids,
        normative_tier=data.normative_tier,
    )
    db.add(record)
    db.commit()
    # Reload so server-side defaults are present in the response.
    db.refresh(record)
    return _build_assertion_response(record)
@router.get("/assertions", response_model=AssertionListResponse)
async def list_assertions(
    entity_type: Optional[str] = Query(None),
    entity_id: Optional[str] = Query(None),
    assertion_type: Optional[str] = Query(None),
    tenant_id: Optional[str] = Query(None),
    limit: int = Query(100, ge=1, le=500),
    db: Session = Depends(get_db),
):
    """Return assertions matching the given equality filters.

    ``total`` is the number of matching rows before ``limit`` is applied; the
    returned page is ordered by ascending ``sentence_index``.
    """
    criteria = (
        (AssertionDB.entity_type, entity_type),
        (AssertionDB.entity_id, entity_id),
        (AssertionDB.assertion_type, assertion_type),
        (AssertionDB.tenant_id, tenant_id),
    )
    query = db.query(AssertionDB)
    for column, wanted in criteria:
        # Empty or missing filter values leave the query unrestricted.
        if wanted:
            query = query.filter(column == wanted)

    total = query.count()
    page = query.order_by(AssertionDB.sentence_index.asc()).limit(limit).all()
    return AssertionListResponse(
        assertions=list(map(_build_assertion_response, page)),
        total=total,
    )
@router.get("/assertions/summary", response_model=AssertionSummaryResponse)
async def assertion_summary(
    tenant_id: Optional[str] = Query(None),
    entity_type: Optional[str] = Query(None),
    entity_id: Optional[str] = Query(None),
    db: Session = Depends(get_db),
):
    """Count assertions by kind for the optionally filtered set.

    Reports the total number of rows, how many are typed ``"fact"`` or
    ``"rationale"``, and how many are typed ``"assertion"`` with no
    ``verified_by`` value.
    """
    query = db.query(AssertionDB)
    for column, wanted in (
        (AssertionDB.tenant_id, tenant_id),
        (AssertionDB.entity_type, entity_type),
        (AssertionDB.entity_id, entity_id),
    ):
        if wanted:
            query = query.filter(column == wanted)

    rows = query.all()
    fact_count = 0
    rationale_count = 0
    unverified_count = 0
    # Single pass over the loaded rows; the three categories are disjoint.
    for row in rows:
        kind = row.assertion_type
        if kind == "fact":
            fact_count += 1
        elif kind == "rationale":
            rationale_count += 1
        elif kind == "assertion" and not row.verified_by:
            unverified_count += 1

    return AssertionSummaryResponse(
        total_assertions=len(rows),
        total_facts=fact_count,
        total_rationale=rationale_count,
        unverified_count=unverified_count,
    )
@router.get("/assertions/{assertion_id}", response_model=AssertionResponse)
async def get_assertion(
    assertion_id: str,
    db: Session = Depends(get_db),
):
    """Fetch one assertion by primary key.

    Raises:
        HTTPException: 404 when no row has the given id.
    """
    record = (
        db.query(AssertionDB)
        .filter(AssertionDB.id == assertion_id)
        .first()
    )
    if record:
        return _build_assertion_response(record)
    raise HTTPException(status_code=404, detail=f"Assertion {assertion_id} not found")
@router.put("/assertions/{assertion_id}", response_model=AssertionResponse)
async def update_assertion(
    assertion_id: str,
    data: AssertionUpdate,
    db: Session = Depends(get_db),
):
    """Apply the explicitly supplied fields of ``data`` to an assertion.

    Only fields that were set in the request body are written; ``updated_at``
    is always refreshed.

    Raises:
        HTTPException: 404 when no row has the given id.
    """
    record = (
        db.query(AssertionDB)
        .filter(AssertionDB.id == assertion_id)
        .first()
    )
    if not record:
        raise HTTPException(status_code=404, detail=f"Assertion {assertion_id} not found")

    changes = data.model_dump(exclude_unset=True)
    for field_name in changes:
        setattr(record, field_name, changes[field_name])
    record.updated_at = datetime.utcnow()

    db.commit()
    db.refresh(record)
    return _build_assertion_response(record)
@router.post("/assertions/{assertion_id}/verify", response_model=AssertionResponse)
async def verify_assertion(
    assertion_id: str,
    verified_by: str = Query(...),
    db: Session = Depends(get_db),
):
    """Mark an assertion as a verified fact.

    Sets ``assertion_type`` to ``"fact"``, records ``verified_by`` and stamps
    ``verified_at`` and ``updated_at`` with the same timestamp.

    Raises:
        HTTPException: 404 when no row has the given id.
    """
    a = db.query(AssertionDB).filter(AssertionDB.id == assertion_id).first()
    if not a:
        raise HTTPException(status_code=404, detail=f"Assertion {assertion_id} not found")

    # Read the clock once so both columns carry an identical value instead of
    # two readings that differ by a few microseconds.
    now = datetime.utcnow()
    a.assertion_type = "fact"
    a.verified_by = verified_by
    a.verified_at = now
    a.updated_at = now
    db.commit()
    db.refresh(a)
    return _build_assertion_response(a)
@router.post("/assertions/extract", response_model=AssertionListResponse)
async def extract_assertions_endpoint(
    data: AssertionExtractRequest,
    tenant_id: Optional[str] = Query(None),
    db: Session = Depends(get_db),
):
    """Run assertion extraction on the submitted text and store the results.

    Each item returned by ``extract_assertions`` becomes one ``AssertionDB``
    row; all rows are committed together and returned.
    """
    extracted = extract_assertions(
        text=data.text,
        entity_type=data.entity_type,
        entity_id=data.entity_id,
        tenant_id=tenant_id,
    )

    rows = [
        AssertionDB(
            id=generate_id(),
            tenant_id=item["tenant_id"],
            entity_type=item["entity_type"],
            entity_id=item["entity_id"],
            sentence_text=item["sentence_text"],
            sentence_index=item["sentence_index"],
            assertion_type=item["assertion_type"],
            evidence_ids=item["evidence_ids"],
            # These two keys are optional in the extractor output.
            normative_tier=item.get("normative_tier"),
            confidence=item.get("confidence", 0.0),
        )
        for item in extracted
    ]
    db.add_all(rows)
    db.commit()
    for row in rows:
        db.refresh(row)

    return AssertionListResponse(
        assertions=[_build_assertion_response(row) for row in rows],
        total=len(rows),
    )
@@ -0,0 +1,53 @@
"""Shared audit trail utilities.
Extracted from isms_routes.py for reuse across evidence, control,
and assertion routes.
"""
import hashlib
import uuid
from datetime import datetime
from sqlalchemy.orm import Session
from ..db.models import AuditTrailDB
def generate_id() -> str:
    """Return a freshly generated random (version 4) UUID as a string."""
    new_id = uuid.uuid4()
    return str(new_id)
def create_signature(data: str) -> str:
    """Return the hex-encoded SHA-256 digest of ``data``."""
    digest = hashlib.sha256()
    digest.update(data.encode())
    return digest.hexdigest()
def log_audit_trail(
    db: Session,
    entity_type: str,
    entity_id: str,
    entity_name: str,
    action: str,
    performed_by: str,
    field_changed: str = None,
    old_value: str = None,
    new_value: str = None,
    change_summary: str = None,
):
    """Stage one ``AuditTrailDB`` row on ``db``.

    The row is only added to the session; this function issues no commit.
    The checksum is the SHA-256 hex digest of
    ``entity_type|entity_id|action|performed_by``.
    """
    checksum_input = f"{entity_type}|{entity_id}|{action}|{performed_by}"
    entry = AuditTrailDB(
        id=generate_id(),
        entity_type=entity_type,
        entity_id=entity_id,
        entity_name=entity_name,
        action=action,
        field_changed=field_changed,
        old_value=old_value,
        new_value=new_value,
        change_summary=change_summary,
        performed_by=performed_by,
        performed_at=datetime.utcnow(),
        checksum=create_signature(checksum_input),
    )
    db.add(entry)
File diff suppressed because it is too large Load Diff
@@ -12,6 +12,7 @@ Endpoints:
POST /v1/canonical/blocked-sources/cleanup — Start cleanup workflow
"""
import asyncio
import json
import logging
from typing import Optional, List
@@ -25,7 +26,16 @@ from compliance.services.control_generator import (
ControlGeneratorPipeline,
GeneratorConfig,
ALL_COLLECTIONS,
VALID_CATEGORIES,
VALID_DOMAINS,
_classify_regulation,
_detect_category,
_detect_domain,
_llm_local,
_parse_llm_json,
CATEGORY_LIST_STR,
)
from compliance.services.citation_backfill import CitationBackfill, BackfillResult
from compliance.services.rag_client import get_rag_client
logger = logging.getLogger(__name__)
@@ -40,9 +50,12 @@ class GenerateRequest(BaseModel):
domain: Optional[str] = None
collections: Optional[List[str]] = None
max_controls: int = 50
max_chunks: int = 1000 # Default: process max 1000 chunks per job (respects document boundaries)
batch_size: int = 5
skip_web_search: bool = False
dry_run: bool = False
regulation_filter: Optional[List[str]] = None # Only process these regulation_code prefixes
skip_prefilter: bool = False # Skip local LLM pre-filter, send all chunks to API
class GenerateResponse(BaseModel):
@@ -55,6 +68,7 @@ class GenerateResponse(BaseModel):
controls_needs_review: int = 0
controls_too_close: int = 0
controls_duplicates_found: int = 0
controls_qa_fixed: int = 0
errors: list = []
controls: list = []
@@ -89,42 +103,111 @@ class BlockedSourceResponse(BaseModel):
# ENDPOINTS
# =============================================================================
async def _run_pipeline_background(config: GeneratorConfig, job_id: str):
    """Run the generation pipeline for an already-created job row.

    Opens its own ``SessionLocal`` session, tags ``config`` with ``job_id``
    and awaits ``ControlGeneratorPipeline.run``. Exceptions are not
    propagated: a failure is logged and the job row is marked ``failed``.
    """
    db = SessionLocal()
    try:
        config.existing_job_id = job_id
        pipeline = ControlGeneratorPipeline(db=db, rag_client=get_rag_client())
        result = await pipeline.run(config)
        logger.info(
            "Background generation job %s completed: %d controls from %d chunks",
            job_id, result.controls_generated, result.total_chunks_scanned,
        )
    except Exception as e:
        logger.error("Background generation job %s failed: %s", job_id, e)
        # Update job as failed
        try:
            # The pipeline may have died inside a failed transaction; without
            # a rollback the session would reject the UPDATE below and the
            # job would stay in status 'running'.
            db.rollback()
            db.execute(
                text("""
                    UPDATE canonical_generation_jobs
                    SET status = 'failed', errors = :errors, completed_at = NOW()
                    WHERE id = CAST(:job_id AS uuid)
                """),
                {"job_id": job_id, "errors": json.dumps([str(e)])},
            )
            db.commit()
        except Exception as mark_err:
            # Best effort only, but leave a trace instead of swallowing it.
            logger.error(
                "Could not mark generation job %s as failed: %s", job_id, mark_err
            )
    finally:
        db.close()
# Strong references to running generation tasks. The event loop keeps only
# weak references to tasks, so an unreferenced task can be garbage-collected
# before it finishes.
_generation_tasks: set = set()


@router.post("/generate", response_model=GenerateResponse)
async def start_generation(req: GenerateRequest):
    """Start a control generation run (runs in background).

    With ``dry_run`` set, the pipeline is awaited inline and the generated
    controls are returned. Otherwise a ``canonical_generation_jobs`` row is
    inserted with status ``running``, the pipeline is scheduled as a
    background task and the response carries only the job id.
    Use GET /generate/status/{job_id} to poll progress.

    Raises:
        HTTPException: 500 if the dry run fails or the job row cannot be
            created.
    """
    config = GeneratorConfig(
        collections=req.collections,
        domain=req.domain,
        batch_size=req.batch_size,
        max_controls=req.max_controls,
        max_chunks=req.max_chunks,
        skip_web_search=req.skip_web_search,
        dry_run=req.dry_run,
        regulation_filter=req.regulation_filter,
        skip_prefilter=req.skip_prefilter,
    )

    if req.dry_run:
        # Dry run: execute synchronously and return controls
        db = SessionLocal()
        try:
            pipeline = ControlGeneratorPipeline(db=db, rag_client=get_rag_client())
            result = await pipeline.run(config)
            return GenerateResponse(
                job_id=result.job_id,
                status=result.status,
                message=f"Dry run: {result.controls_generated} controls from {result.total_chunks_scanned} chunks",
                total_chunks_scanned=result.total_chunks_scanned,
                controls_generated=result.controls_generated,
                controls_verified=result.controls_verified,
                controls_needs_review=result.controls_needs_review,
                controls_too_close=result.controls_too_close,
                controls_duplicates_found=result.controls_duplicates_found,
                errors=result.errors,
                controls=result.controls,
            )
        except Exception as e:
            logger.error("Dry run failed: %s", e)
            raise HTTPException(status_code=500, detail=str(e))
        finally:
            db.close()

    # Create job record first so we can return the ID
    db = SessionLocal()
    try:
        result = db.execute(
            text("""
                INSERT INTO canonical_generation_jobs (status, config)
                VALUES ('running', :config)
                RETURNING id
            """),
            {"config": json.dumps(config.model_dump())},
        )
        db.commit()
        row = result.fetchone()
        job_id = str(row[0]) if row else None
    except Exception as e:
        logger.error("Failed to create job: %s", e)
        raise HTTPException(status_code=500, detail=f"Failed to create job: {e}")
    finally:
        db.close()

    if not job_id:
        raise HTTPException(status_code=500, detail="Failed to create job record")

    # Launch pipeline in background and hold a reference until it is done.
    task = asyncio.create_task(_run_pipeline_background(config, job_id))
    _generation_tasks.add(task)
    task.add_done_callback(_generation_tasks.discard)

    return GenerateResponse(
        job_id=job_id,
        status="running",
        message="Generation started in background. Poll /generate/status/{job_id} for progress.",
    )
@router.get("/generate/status/{job_id}")
async def get_job_status(job_id: str):
@@ -132,7 +215,7 @@ async def get_job_status(job_id: str):
db = SessionLocal()
try:
result = db.execute(
text("SELECT * FROM canonical_generation_jobs WHERE id = :id::uuid"),
text("SELECT * FROM canonical_generation_jobs WHERE id = CAST(:id AS uuid)"),
{"id": job_id},
)
row = result.fetchone()
@@ -270,6 +353,188 @@ async def review_control(control_id: str, req: ReviewRequest):
db.close()
# Request body for POST /generate/bulk-review.
class BulkReviewRequest(BaseModel):
release_state: str # Filter: which controls to bulk-review
action: str # "approve" or "reject"
# Only consulted for action="approve"; the endpoint falls back to "draft" when unset.
new_state: Optional[str] = None # Override target state
@router.post("/generate/bulk-review")
async def bulk_review(req: BulkReviewRequest):
    """Move every control in one release state to a new state.

    ``reject`` always targets ``deprecated``; ``approve`` targets
    ``req.new_state`` or ``draft`` when none is given.

    Raises:
        HTTPException: 400 for an unsupported source state, action or
            target state.
    """
    source_state = req.release_state
    if source_state not in ("needs_review", "too_close", "duplicate"):
        raise HTTPException(status_code=400, detail=f"Invalid filter state: {req.release_state}")

    if req.action not in ("approve", "reject"):
        raise HTTPException(status_code=400, detail=f"Unknown action: {req.action}")
    target = "deprecated" if req.action == "reject" else (req.new_state or "draft")

    if target not in ("draft", "review", "approved", "deprecated", "needs_review"):
        raise HTTPException(status_code=400, detail=f"Invalid target state: {target}")

    db = SessionLocal()
    try:
        statement = text("""
            UPDATE canonical_controls
            SET release_state = :target, updated_at = NOW()
            WHERE release_state = :source
            RETURNING control_id
        """)
        outcome = db.execute(statement, {"source": source_state, "target": target})
        # Consume the RETURNING rows before committing.
        changed_ids = [returned[0] for returned in outcome]
        db.commit()
        return {
            "action": req.action,
            "source_state": source_state,
            "target_state": target,
            "affected_count": len(changed_ids),
        }
    finally:
        db.close()
# Request body for POST /generate/qa-reclassify.
class QAReclassifyRequest(BaseModel):
limit: int = 100 # How many controls to reclassify per run
dry_run: bool = True # Preview only by default
filter_category: Optional[str] = None # Only reclassify controls of this category
# Matched as "control_id LIKE '<prefix>-%'" by the endpoint.
filter_domain_prefix: Optional[str] = None # Only reclassify controls with this prefix
@router.post("/generate/qa-reclassify")
async def qa_reclassify(req: QAReclassifyRequest):
    """Run QA reclassification on existing controls using local LLM.

    Loads up to ``req.limit`` non-deprecated controls (newest first), runs
    keyword detection on each and, where it disagrees with the stored
    category or control-id prefix, asks the local LLM to arbitrate. Only the
    category is ever written back, and only when ``req.dry_run`` is false.

    Returns:
        Dict with ``checked`` and ``mismatches`` counters, the proposed
        ``fixes`` and per-control ``errors``.
    """
    db = SessionLocal()
    try:
        # Load controls to check
        where_clauses = ["release_state NOT IN ('deprecated')"]
        params = {"limit": req.limit}
        if req.filter_category:
            where_clauses.append("category = :cat")
            params["cat"] = req.filter_category
        if req.filter_domain_prefix:
            where_clauses.append("control_id LIKE :prefix")
            params["prefix"] = f"{req.filter_domain_prefix}-%"

        # Only fixed clause strings are interpolated; user values are bound.
        where_sql = " AND ".join(where_clauses)
        rows = db.execute(
            text(f"""
                SELECT id, control_id, title, objective, category,
                COALESCE(requirements::text, '[]') as requirements,
                COALESCE(source_original_text, '') as source_text
                FROM canonical_controls
                WHERE {where_sql}
                ORDER BY created_at DESC
                LIMIT :limit
            """),
            params,
        ).fetchall()

        results = {"checked": 0, "mismatches": 0, "fixes": [], "errors": []}
        for row in rows:
            results["checked"] += 1
            control_id = row[1]
            # Guard NULL titles the same way as NULL objectives, otherwise
            # the slicing below raises and the control lands in "errors".
            title = row[2] or ""
            objective = row[3] or ""
            current_category = row[4]
            source_text = row[6] or objective

            # Keyword detection on source text
            kw_category = _detect_category(source_text) or _detect_category(objective)
            kw_domain = _detect_domain(source_text)
            current_prefix = control_id.split("-")[0] if "-" in control_id else ""

            # Skip if keyword detection agrees with current classification
            if kw_category == current_category and kw_domain == current_prefix:
                continue
            results["mismatches"] += 1

            # Ask Ollama to arbitrate
            try:
                reqs_text = ""
                try:
                    reqs = json.loads(row[5])
                    if isinstance(reqs, list):
                        reqs_text = ", ".join(str(r) for r in reqs[:3])
                except Exception:
                    # Unparseable requirements are simply left out of the prompt.
                    pass

                prompt = f"""Pruefe dieses Compliance-Control auf korrekte Klassifizierung.
Titel: {title[:100]}
Ziel: {objective[:200]}
Anforderungen: {reqs_text[:200]}
Aktuelle Zuordnung: domain={current_prefix}, category={current_category}
Keyword-Erkennung: domain={kw_domain}, category={kw_category}
Welche Zuordnung ist korrekt? Antworte NUR als JSON:
{{"domain": "KUERZEL", "category": "kategorie_name", "reason": "kurze Begruendung"}}
Domains: AUTH=Authentifizierung, CRYP=Kryptographie, NET=Netzwerk, DATA=Datenschutz, LOG=Logging, ACC=Zugriffskontrolle, SEC=IT-Sicherheit, INC=Vorfallmanagement, AI=KI, COMP=Compliance, GOV=Behoerden, LAB=Arbeitsrecht, FIN=Finanzregulierung, TRD=Gewerbe, ENV=Umwelt, HLT=Gesundheit
Kategorien: {CATEGORY_LIST_STR}"""

                raw = await _llm_local(prompt)
                data = _parse_llm_json(raw)
                if not data:
                    continue

                # The LLM may answer null or a non-string; coerce so the
                # membership checks below cannot raise.
                qa_domain = str(data.get("domain") or "").upper()
                qa_category = str(data.get("category") or "")
                reason = data.get("reason", "")

                fix_entry = {
                    "control_id": control_id,
                    "title": title[:80],
                    "old_category": current_category,
                    "old_domain": current_prefix,
                    "new_category": qa_category if qa_category in VALID_CATEGORIES else current_category,
                    "new_domain": qa_domain if qa_domain in VALID_DOMAINS else current_prefix,
                    "reason": reason,
                }

                category_changed = qa_category in VALID_CATEGORIES and qa_category != current_category
                if category_changed and not req.dry_run:
                    db.execute(
                        text("""
                            UPDATE canonical_controls
                            SET category = :category, updated_at = NOW()
                            WHERE id = :id
                        """),
                        {"id": row[0], "category": qa_category},
                    )
                    fix_entry["applied"] = True
                else:
                    fix_entry["applied"] = False

                results["fixes"].append(fix_entry)
            except Exception as e:
                results["errors"].append({"control_id": control_id, "error": str(e)})

        if not req.dry_run:
            db.commit()
        return results
    finally:
        db.close()
@router.get("/generate/processed-stats")
async def get_processed_stats():
"""Get processing statistics per collection."""
@@ -429,3 +694,407 @@ async def get_controls_customer_view(
return {"controls": controls, "total": len(controls)}
finally:
db.close()
# =============================================================================
# CITATION BACKFILL
# =============================================================================
# Request body for POST /generate/backfill-citations.
class BackfillRequest(BaseModel):
dry_run: bool = True # Default to dry_run for safety
limit: int = 0 # 0 = all controls
# Response model of POST /generate/backfill-citations.
# That endpoint only sets `status`; the counters keep their defaults there.
class BackfillResponse(BaseModel):
status: str
total_controls: int = 0
matched_hash: int = 0
matched_regex: int = 0
matched_llm: int = 0
unmatched: int = 0
updated: int = 0
errors: list = []
_backfill_status: dict = {}


async def _run_backfill_background(dry_run: bool, limit: int, backfill_id: str):
    """Execute a citation backfill on a dedicated DB session.

    The outcome is stored in ``_backfill_status[backfill_id]``: either the
    result counters with at most 50 errors, or a ``failed`` entry carrying
    the exception text. Exceptions are not propagated.
    """
    db = SessionLocal()
    try:
        runner = CitationBackfill(db=db, rag_client=get_rag_client())
        outcome = await runner.run(dry_run=dry_run, limit=limit)

        summary = {"status": "completed"}
        for counter in (
            "total_controls",
            "matched_hash",
            "matched_regex",
            "matched_llm",
            "unmatched",
            "updated",
        ):
            summary[counter] = getattr(outcome, counter)
        summary["errors"] = outcome.errors[:50]
        _backfill_status[backfill_id] = summary

        logger.info("Backfill %s completed: %d updated", backfill_id, outcome.updated)
    except Exception as exc:
        logger.error("Backfill %s failed: %s", backfill_id, exc)
        _backfill_status[backfill_id] = {"status": "failed", "errors": [str(exc)]}
    finally:
        db.close()
# Strong references to running citation backfill tasks. The event loop keeps
# only weak references, so an unreferenced task can be garbage-collected
# mid-run.
_citation_backfill_tasks: set = set()


@router.post("/generate/backfill-citations", response_model=BackfillResponse)
async def start_backfill(req: BackfillRequest):
    """Backfill article/paragraph into existing control source_citations.

    Uses 3-tier matching: hash lookup → regex parse → Ollama LLM.
    Default is dry_run=True (preview only, no DB changes).

    The work is scheduled as a background task; the response only carries
    the short backfill id inside ``status``.
    """
    import uuid

    backfill_id = str(uuid.uuid4())[:8]
    _backfill_status[backfill_id] = {"status": "running"}

    # Always run in background (RAG index build takes minutes)
    task = asyncio.create_task(
        _run_backfill_background(req.dry_run, req.limit, backfill_id)
    )
    _citation_backfill_tasks.add(task)
    task.add_done_callback(_citation_backfill_tasks.discard)

    return BackfillResponse(
        status=f"running (id={backfill_id})",
    )
@router.get("/generate/backfill-status/{backfill_id}")
async def get_backfill_status(backfill_id: str):
    """Return the recorded state of a citation backfill job.

    Raises:
        HTTPException: 404 when no entry exists for ``backfill_id``.
    """
    entry = _backfill_status.get(backfill_id)
    if entry:
        return entry
    raise HTTPException(status_code=404, detail="Backfill job not found")
# =============================================================================
# DOMAIN + TARGET AUDIENCE BACKFILL
# =============================================================================
# Request body for POST /generate/backfill-domain.
class DomainBackfillRequest(BaseModel):
dry_run: bool = True
job_id: Optional[str] = None # Only backfill controls from this job
limit: int = 0 # 0 = all
# In-memory progress store keyed by backfill_id; written by _run_domain_backfill
# and read by the domain-backfill-status endpoint.
_domain_backfill_status: dict = {}
async def _run_domain_backfill(req: DomainBackfillRequest, backfill_id: str):
"""Backfill domain, category, and target_audience for existing controls using Anthropic."""
import os
import httpx
ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY", "")
ANTHROPIC_MODEL = os.getenv("CONTROL_GEN_ANTHROPIC_MODEL", "claude-sonnet-4-6")
# Without an API key the job is recorded as failed before any DB session is opened.
if not ANTHROPIC_API_KEY:
_domain_backfill_status[backfill_id] = {
"status": "failed", "error": "ANTHROPIC_API_KEY not set"
}
return
db = SessionLocal()
try:
# Find controls needing backfill
where_clauses = ["(target_audience IS NULL OR target_audience = '[]' OR target_audience = 'null')"]
params: dict = {}
if req.job_id:
where_clauses.append("generation_metadata->>'job_id' = :job_id")
params["job_id"] = req.job_id
query = f"""
SELECT id, control_id, title, objective, category, source_original_text, tags
FROM canonical_controls
WHERE {' AND '.join(where_clauses)}
ORDER BY control_id
"""
# NOTE(review): LIMIT is interpolated into the SQL text rather than bound as a
# parameter; req.limit is declared as int on DomainBackfillRequest.
if req.limit > 0:
query += f" LIMIT {req.limit}"
result = db.execute(text(query), params)
# Materialize rows as dicts keyed by column name.
controls = [dict(zip(result.keys(), row)) for row in result]
total = len(controls)
updated = 0
errors = []
_domain_backfill_status[backfill_id] = {
"status": "running", "total": total, "updated": 0, "errors": []
}
# Process in batches of 10
BATCH_SIZE = 10
for batch_start in range(0, total, BATCH_SIZE):
batch = controls[batch_start:batch_start + BATCH_SIZE]
entries = []
for idx, ctrl in enumerate(batch):
# Prefer the objective, fall back to the title; append up to 500 chars of source text.
text_for_analysis = ctrl.get("objective") or ctrl.get("title") or ""
original = ctrl.get("source_original_text") or ""
if original:
text_for_analysis += f"\n\nQuelltext-Auszug: {original[:500]}"
entries.append(
f"--- CONTROL {idx + 1}: {ctrl['control_id']} ---\n"
f"Titel: {ctrl.get('title', '')}\n"
f"Objective: {text_for_analysis[:800]}\n"
f"Tags: {json.dumps(ctrl.get('tags', []))}"
)
# NOTE(review): the entries are joined with an empty separator at the end of the
# prompt, so consecutive control sections are not separated by a newline.
prompt = f"""Analysiere die folgenden {len(batch)} Controls und bestimme fuer jedes:
1. domain: Das Fachgebiet (AUTH, CRYP, NET, DATA, LOG, ACC, SEC, INC, AI, COMP, GOV, LAB, FIN, TRD, ENV, HLT)
2. category: Die Kategorie (encryption, authentication, network, data_protection, logging, incident, continuity, compliance, supply_chain, physical, personnel, application, system, risk, governance, hardware, identity, public_administration, labor_law, finance, trade_regulation, environmental, health)
3. target_audience: Liste der Zielgruppen (moegliche Werte: "unternehmen", "behoerden", "entwickler", "datenschutzbeauftragte", "geschaeftsfuehrung", "it-abteilung", "rechtsabteilung", "compliance-officer", "personalwesen", "einkauf", "produktion", "vertrieb", "gesundheitswesen", "finanzwesen", "oeffentlicher_dienst")
Antworte mit einem JSON-Array mit {len(batch)} Objekten. Jedes Objekt hat:
- control_index: 1-basierter Index
- domain: Fachgebiet-Kuerzel
- category: Kategorie
- target_audience: Liste der Zielgruppen
{"".join(entries)}"""
try:
headers = {
"x-api-key": ANTHROPIC_API_KEY,
"anthropic-version": "2023-06-01",
"content-type": "application/json",
}
payload = {
"model": ANTHROPIC_MODEL,
"max_tokens": 4096,
"system": "Du bist ein Compliance-Experte. Klassifiziere Controls nach Fachgebiet und Zielgruppe. Antworte NUR mit validem JSON.",
"messages": [{"role": "user", "content": prompt}],
}
async with httpx.AsyncClient(timeout=60.0) as client:
resp = await client.post(
"https://api.anthropic.com/v1/messages",
headers=headers,
json=payload,
)
# A non-200 answer skips this batch and is recorded in errors.
if resp.status_code != 200:
errors.append(f"Anthropic API {resp.status_code} at batch {batch_start}")
continue
raw = resp.json().get("content", [{}])[0].get("text", "")
# Parse response
import re
# Greedy match from the first "[" to the last "]" in the reply.
bracket_match = re.search(r"\[.*\]", raw, re.DOTALL)
if not bracket_match:
errors.append(f"No JSON array in response at batch {batch_start}")
continue
results_list = json.loads(bracket_match.group(0))
for item in results_list:
# control_index is 1-based in the prompt; out-of-range answers are ignored.
idx = item.get("control_index", 0) - 1
if idx < 0 or idx >= len(batch):
continue
ctrl = batch[idx]
ctrl_id = str(ctrl["id"])
new_domain = item.get("domain", "")
new_category = item.get("category", "")
new_audience = item.get("target_audience", [])
if not isinstance(new_audience, list):
new_audience = []
# Build new control_id from domain if domain changed
# NOTE(review): old_prefix and new_prefix are computed but not used further below.
old_prefix = ctrl["control_id"].split("-")[0] if ctrl["control_id"] else ""
new_prefix = new_domain.upper()[:4] if new_domain else old_prefix
if not req.dry_run:
update_parts = []
update_params: dict = {"ctrl_id": ctrl_id}
if new_category:
update_parts.append("category = :category")
update_params["category"] = new_category
if new_audience:
update_parts.append("target_audience = :target_audience")
update_params["target_audience"] = json.dumps(new_audience)
# Note: We do NOT rename control_ids here — that would
# break references and cause unique constraint violations.
if update_parts:
update_parts.append("updated_at = NOW()")
db.execute(
text(f"UPDATE canonical_controls SET {', '.join(update_parts)} WHERE id = CAST(:ctrl_id AS uuid)"),
update_params,
)
updated += 1
if not req.dry_run:
db.commit()
except Exception as e:
# A failing batch is recorded and its pending changes are rolled back.
errors.append(f"Batch {batch_start}: {str(e)}")
db.rollback()
# Publish progress; only the last 10 errors are exposed while running.
_domain_backfill_status[backfill_id] = {
"status": "running", "total": total, "updated": updated,
"progress": f"{min(batch_start + BATCH_SIZE, total)}/{total}",
"errors": errors[-10:],
}
_domain_backfill_status[backfill_id] = {
"status": "completed", "total": total, "updated": updated,
"errors": errors[-50:],
}
logger.info("Domain backfill %s completed: %d/%d updated", backfill_id, updated, total)
except Exception as e:
logger.error("Domain backfill %s failed: %s", backfill_id, e)
_domain_backfill_status[backfill_id] = {"status": "failed", "error": str(e)}
finally:
db.close()
# Strong references to running domain backfill tasks. The event loop keeps
# only weak references, so an unreferenced task can be garbage-collected
# mid-run.
_domain_backfill_tasks: set = set()


@router.post("/generate/backfill-domain")
async def start_domain_backfill(req: DomainBackfillRequest):
    """Backfill domain, category, and target_audience for controls using Anthropic API.

    Finds controls where target_audience is NULL and enriches them.
    Default is dry_run=True (preview only).

    The work is scheduled as a background task; progress is available from
    GET /generate/domain-backfill-status/{backfill_id}.
    """
    import uuid

    backfill_id = str(uuid.uuid4())[:8]
    _domain_backfill_status[backfill_id] = {"status": "starting"}

    task = asyncio.create_task(_run_domain_backfill(req, backfill_id))
    _domain_backfill_tasks.add(task)
    task.add_done_callback(_domain_backfill_tasks.discard)

    # Point at the domain backfill status route; /generate/backfill-status/
    # serves citation backfills and would answer 404 for this id.
    return {"status": "running", "backfill_id": backfill_id,
            "message": f"Domain backfill started. Poll /generate/domain-backfill-status/{backfill_id}"}
@router.get("/generate/domain-backfill-status/{backfill_id}")
async def get_domain_backfill_status(backfill_id: str):
    """Return the recorded state of a domain backfill job.

    Raises:
        HTTPException: 404 when no entry exists for ``backfill_id``.
    """
    entry = _domain_backfill_status.get(backfill_id)
    if entry:
        return entry
    raise HTTPException(status_code=404, detail="Domain backfill job not found")
# ---------------------------------------------------------------------------
# Source-Type Backfill — Classify law vs guideline vs standard vs restricted
# ---------------------------------------------------------------------------
# Request body for POST /generate/backfill-source-type.
class SourceTypeBackfillRequest(BaseModel):
# Preview-only by default.
dry_run: bool = True
# In-memory progress store keyed by backfill_id; written by
# _run_source_type_backfill and read by the source-type-backfill-status endpoint.
_source_type_backfill_status: dict = {}
async def _run_source_type_backfill(dry_run: bool, backfill_id: str):
"""Backfill source_type into source_citation JSONB for all controls."""
db = SessionLocal()
try:
# Find controls with source_citation that lack source_type
# NOTE(review): this query uses the schema-qualified table name
# compliance.canonical_controls.
rows = db.execute(text("""
SELECT control_id, source_citation, generation_metadata
FROM compliance.canonical_controls
WHERE source_citation IS NOT NULL
AND (source_citation->>'source_type' IS NULL
OR source_citation->>'source_type' = '')
""")).fetchall()
total = len(rows)
updated = 0
already_correct = 0
errors = []
_source_type_backfill_status[backfill_id] = {
"status": "running", "total": total, "updated": 0, "dry_run": dry_run,
}
for row in rows:
cid = row[0]
# JSON columns may arrive as dicts or as JSON text; normalize both to dicts.
citation = row[1] if isinstance(row[1], dict) else json.loads(row[1] or "{}")
metadata = row[2] if isinstance(row[2], dict) else json.loads(row[2] or "{}")
# Get regulation_code from metadata
reg_code = metadata.get("source_regulation", "")
if not reg_code:
# Try to infer from source name
errors.append(f"{cid}: no source_regulation in metadata")
continue
# Classify
license_info = _classify_regulation(reg_code)
# Falls back to "restricted" when the classifier result has no source_type.
source_type = license_info.get("source_type", "restricted")
# Update citation
citation["source_type"] = source_type
if not dry_run:
db.execute(text("""
UPDATE compliance.canonical_controls
SET source_citation = :citation
WHERE control_id = :cid
"""), {"citation": json.dumps(citation), "cid": cid})
# NOTE(review): this check runs before the increment below, so it is also
# true while updated is still 0.
if updated % 100 == 0:
db.commit()
updated += 1
if not dry_run:
db.commit()
# Count distribution
# The distribution is only queried for real runs; a dry run reports an empty dict.
dist_query = db.execute(text("""
SELECT source_citation->>'source_type' as st, COUNT(*)
FROM compliance.canonical_controls
WHERE source_citation IS NOT NULL
AND source_citation->>'source_type' IS NOT NULL
GROUP BY st
""")).fetchall() if not dry_run else []
distribution = {r[0]: r[1] for r in dist_query}
_source_type_backfill_status[backfill_id] = {
"status": "completed", "total": total, "updated": updated,
"dry_run": dry_run, "distribution": distribution,
"errors": errors[:50],
}
logger.info("Source-type backfill %s completed: %d/%d updated (dry_run=%s)",
backfill_id, updated, total, dry_run)
except Exception as e:
logger.error("Source-type backfill %s failed: %s", backfill_id, e)
_source_type_backfill_status[backfill_id] = {"status": "failed", "error": str(e)}
finally:
db.close()
# Strong references to in-flight backfill tasks. The event loop itself only
# holds weak references to tasks, so a fire-and-forget task that nobody else
# references may be garbage-collected before it finishes.
_source_type_backfill_tasks: set = set()


@router.post("/generate/backfill-source-type")
async def start_source_type_backfill(req: SourceTypeBackfillRequest):
    """Start a background job that backfills ``source_type`` into ``source_citation``.

    The job classifies each control's source as binding law, authority
    guideline, voluntary standard, or restricted norm based on its
    regulation code. ``req.dry_run`` is forwarded to the job; the original
    endpoint notes state that it defaults to True (preview only).

    Returns:
        A dict with ``status``, the short ``backfill_id`` and a ``message``
        naming the status endpoint to poll.
    """
    import uuid

    # First 8 characters of the UUID string serve as a short job handle.
    backfill_id = str(uuid.uuid4())[:8]
    _source_type_backfill_status[backfill_id] = {"status": "starting"}

    task = asyncio.create_task(_run_source_type_backfill(req.dry_run, backfill_id))
    # Keep the task alive until it completes, then drop the reference again.
    _source_type_backfill_tasks.add(task)
    task.add_done_callback(_source_type_backfill_tasks.discard)

    return {
        "status": "running",
        "backfill_id": backfill_id,
        "message": f"Source-type backfill started. Poll /generate/source-type-backfill-status/{backfill_id}",
    }
@router.get("/generate/source-type-backfill-status/{backfill_id}")
async def get_source_type_backfill_status(backfill_id: str):
    """Return the recorded state of one source-type backfill job.

    Raises:
        HTTPException: 404 when no (non-empty) status entry is stored for
            ``backfill_id``.
    """
    job_state = _source_type_backfill_status.get(backfill_id)
    if job_state:
        return job_state
    raise HTTPException(status_code=404, detail="Source-type backfill job not found")
@@ -0,0 +1,856 @@
"""
FastAPI routes for the Multi-Layer Control Architecture.
Pattern Library, Obligation Extraction, Crosswalk Matrix, and Migration endpoints.
Endpoints:
GET /v1/canonical/patterns — All patterns (with filters)
GET /v1/canonical/patterns/{pattern_id} — Single pattern
GET /v1/canonical/patterns/{pattern_id}/controls — Controls for a pattern
POST /v1/canonical/obligations/extract — Extract obligations from text
GET /v1/canonical/crosswalk — Query crosswalk matrix
GET /v1/canonical/crosswalk/stats — Coverage statistics
POST /v1/canonical/migrate/decompose — Pass 0a: Obligation extraction
POST /v1/canonical/migrate/merge-obligations — Merge implementation-level dupes
POST /v1/canonical/migrate/enrich-obligations — Add trigger_type, impl metadata
POST /v1/canonical/migrate/compose-atomic — Pass 0b: Atomic control composition
POST /v1/canonical/migrate/link-obligations — Pass 1: Obligation linkage
POST /v1/canonical/migrate/classify-patterns — Pass 2: Pattern classification
POST /v1/canonical/migrate/triage — Pass 3: Quality triage
POST /v1/canonical/migrate/backfill-crosswalk — Pass 4: Crosswalk backfill
POST /v1/canonical/migrate/deduplicate — Pass 5: Deduplication
GET /v1/canonical/migrate/status — Migration progress
GET /v1/canonical/migrate/decomposition-status — Decomposition progress
"""
import json
import logging
from typing import Optional, List
from fastapi import APIRouter, HTTPException, Query
from pydantic import BaseModel
from sqlalchemy import text
from database import SessionLocal
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/v1/canonical", tags=["crosswalk"])
# =============================================================================
# REQUEST / RESPONSE MODELS
# =============================================================================
class PatternResponse(BaseModel):
    """Summary view of one control pattern, as used in the pattern list response."""

    id: str
    name: str
    name_de: str  # presumably the German display name — confirm against the pattern library
    domain: str  # the list endpoint compares this against the upper-cased domain filter
    category: str  # the list endpoint compares this against the lower-cased category filter
    description: str
    objective_template: str
    severity_default: str
    implementation_effort_default: str = "m"
    tags: list = []
    composable_with: list = []
    open_anchor_refs: list = []
    controls_count: int = 0  # non-deprecated controls referencing this pattern (0 if none counted)
class PatternListResponse(BaseModel):
    """Envelope returned by GET /patterns."""

    patterns: List[PatternResponse]
    total: int  # number of entries in ``patterns`` after filtering
class PatternDetailResponse(PatternResponse):
    """Full view of one pattern: the summary fields plus its templates and match keywords."""

    rationale_template: str = ""
    requirements_template: list = []
    test_procedure_template: list = []
    evidence_template: list = []
    obligation_match_keywords: list = []
class ObligationExtractRequest(BaseModel):
    """Request body for POST /obligations/extract."""

    text: str  # source text the obligation is extracted from
    regulation_code: Optional[str] = None  # the endpoint passes "" to the extractor when omitted
    article: Optional[str] = None
    paragraph: Optional[str] = None
class ObligationExtractResponse(BaseModel):
    """Result of an obligation extraction together with the pattern match made on it."""

    obligation_id: Optional[str] = None
    obligation_title: Optional[str] = None
    obligation_text: Optional[str] = None
    method: str = "none"  # copied from the extractor result's ``method``
    confidence: float = 0.0  # copied from the extractor result's ``confidence``
    regulation_id: Optional[str] = None
    pattern_id: Optional[str] = None  # None when the keyword matcher found no pattern
    pattern_confidence: float = 0.0  # 0 when the keyword matcher found no pattern
class CrosswalkRow(BaseModel):
    """One row of the ``crosswalk_matrix`` table as returned by GET /crosswalk."""

    regulation_code: str = ""  # the query endpoint maps a NULL/empty DB value to ""
    article: Optional[str] = None
    obligation_id: Optional[str] = None
    pattern_id: Optional[str] = None
    master_control_id: Optional[str] = None
    confidence: float = 0.0  # the query endpoint maps a NULL DB value to 0
    source: str = "auto"  # the query endpoint maps a NULL/empty DB value to "auto"
class CrosswalkQueryResponse(BaseModel):
    """Envelope returned by GET /crosswalk."""

    rows: List[CrosswalkRow]  # the requested page of matching rows
    total: int  # count of all rows matching the filters, independent of limit/offset
class CrosswalkStatsResponse(BaseModel):
    """Coverage statistics over the ``crosswalk_matrix`` table (GET /crosswalk/stats)."""

    total_rows: int = 0
    regulations_covered: int = 0  # distinct non-empty regulation codes
    obligations_linked: int = 0  # distinct non-NULL obligation ids
    patterns_used: int = 0  # distinct non-NULL pattern ids
    controls_linked: int = 0  # distinct non-NULL master control ids
    coverage_by_regulation: dict = {}  # regulation_code -> number of crosswalk rows
class MigrationRequest(BaseModel):
    """Options accepted by the synchronous migration pass endpoints.

    Not every endpoint reads every field; each endpoint forwards only the
    options its pass takes.
    """

    limit: int = 0  # 0 = no limit
    batch_size: int = 0  # 0 = auto (5 for Anthropic, 1 for Ollama)
    use_anthropic: bool = False  # Use Anthropic API instead of Ollama
    category_filter: Optional[str] = None  # Comma-separated categories
    source_filter: Optional[str] = None  # Comma-separated source regulations (ILIKE match)
class BatchSubmitRequest(BaseModel):
    """Options accepted by the batch-submit endpoints (Pass 0a / Pass 0b)."""

    limit: int = 0  # presumably 0 = no limit, as in MigrationRequest — confirm in the service
    batch_size: int = 5
    category_filter: Optional[str] = None  # forwarded by the 0a submit endpoint only
    source_filter: Optional[str] = None  # forwarded by the 0a submit endpoint only
class BatchProcessRequest(BaseModel):
    """Request body for POST /migrate/batch-process."""

    batch_id: str  # identifier of the batch whose results should be processed
    pass_type: str = "0a"  # "0a" or "0b"
class MigrationResponse(BaseModel):
    """Common response of the migration endpoints: a status label plus pass statistics."""

    status: str = "completed"
    stats: dict = {}  # statistics dict as returned by the executed pass
class MigrationStatusResponse(BaseModel):
    """Overall migration progress, populated from ``MigrationPasses.migration_status()``.

    NOTE(review): the exact meaning of each counter is defined by the
    service, which is not visible here — confirm before relying on it.
    """

    total_controls: int = 0
    has_obligation: int = 0
    has_pattern: int = 0
    fully_linked: int = 0
    deprecated: int = 0
    coverage_obligation_pct: float = 0.0
    coverage_pattern_pct: float = 0.0
    coverage_full_pct: float = 0.0
class DecompositionStatusResponse(BaseModel):
    """Decomposition progress, populated from ``DecompositionPass.decomposition_status()``.

    NOTE(review): the exact meaning of each counter is defined by the
    service, which is not visible here — confirm before relying on it.
    """

    rich_controls: int = 0
    decomposed_controls: int = 0
    total_candidates: int = 0
    validated: int = 0
    rejected: int = 0
    composed: int = 0
    atomic_controls: int = 0
    merged: int = 0
    enriched: int = 0
    ready_for_pass0b: int = 0
    decomposition_pct: float = 0.0
    composition_pct: float = 0.0
# =============================================================================
# PATTERN LIBRARY ENDPOINTS
# =============================================================================
@router.get("/patterns", response_model=PatternListResponse)
async def list_patterns(
    domain: Optional[str] = Query(None, description="Filter by domain (e.g. AUTH, CRYP)"),
    category: Optional[str] = Query(None, description="Filter by category"),
    tag: Optional[str] = Query(None, description="Filter by tag"),
):
    """Return the control patterns, optionally narrowed by domain, category and tag.

    The domain filter is upper-cased and the category filter lower-cased
    before comparison; the tag filter matches case-insensitively against
    each pattern's tags. Every returned pattern carries the number of
    non-deprecated controls linked to it.
    """
    from compliance.services.pattern_matcher import PatternMatcher

    matcher = PatternMatcher()
    matcher._load_patterns()
    matcher._build_keyword_index()

    selected = matcher._patterns
    if domain:
        wanted_domain = domain.upper()
        selected = [pat for pat in selected if pat.domain == wanted_domain]
    if category:
        wanted_category = category.lower()
        selected = [pat for pat in selected if pat.category == wanted_category]
    if tag:
        wanted_tag = tag.lower()
        selected = [
            pat for pat in selected
            if any(candidate.lower() == wanted_tag for candidate in pat.tags)
        ]

    # Controls per pattern, looked up once for the whole listing.
    counts_by_pattern = _get_pattern_control_counts()

    summaries = [
        PatternResponse(
            id=pat.id,
            name=pat.name,
            name_de=pat.name_de,
            domain=pat.domain,
            category=pat.category,
            description=pat.description,
            objective_template=pat.objective_template,
            severity_default=pat.severity_default,
            implementation_effort_default=pat.implementation_effort_default,
            tags=pat.tags,
            composable_with=pat.composable_with,
            open_anchor_refs=pat.open_anchor_refs,
            controls_count=counts_by_pattern.get(pat.id, 0),
        )
        for pat in selected
    ]
    return PatternListResponse(patterns=summaries, total=len(summaries))
@router.get("/patterns/{pattern_id}", response_model=PatternDetailResponse)
async def get_pattern(pattern_id: str):
    """Return the full definition of a single control pattern.

    Raises:
        HTTPException: 404 when the matcher knows no pattern with this id.
    """
    from compliance.services.pattern_matcher import PatternMatcher

    matcher = PatternMatcher()
    matcher._load_patterns()

    found = matcher.get_pattern(pattern_id)
    if not found:
        raise HTTPException(status_code=404, detail=f"Pattern {pattern_id} not found")

    linked_controls = _get_pattern_control_counts().get(found.id, 0)

    return PatternDetailResponse(
        # identity and classification
        id=found.id,
        name=found.name,
        name_de=found.name_de,
        domain=found.domain,
        category=found.category,
        description=found.description,
        # templates
        objective_template=found.objective_template,
        rationale_template=found.rationale_template,
        requirements_template=found.requirements_template,
        test_procedure_template=found.test_procedure_template,
        evidence_template=found.evidence_template,
        # defaults and relations
        severity_default=found.severity_default,
        implementation_effort_default=found.implementation_effort_default,
        tags=found.tags,
        composable_with=found.composable_with,
        open_anchor_refs=found.open_anchor_refs,
        obligation_match_keywords=found.obligation_match_keywords,
        controls_count=linked_controls,
    )
@router.get("/patterns/{pattern_id}/controls")
async def get_pattern_controls(
    pattern_id: str,
    limit: int = Query(50, ge=1, le=500),
    offset: int = Query(0, ge=0),
):
    """Return one page of non-deprecated controls generated from a pattern.

    The pattern id is upper-cased before it is compared with the
    ``pattern_id`` column. ``total`` counts every matching control and is
    independent of ``limit``/``offset``.
    """

    def _as_id_list(raw):
        # obligation_ids may arrive as a JSON string; undecodable text yields [].
        if isinstance(raw, str):
            try:
                raw = json.loads(raw)
            except (json.JSONDecodeError, TypeError):
                raw = []
        return raw or []

    db = SessionLocal()
    try:
        pattern_key = pattern_id.upper()

        page = db.execute(
            text("""
                SELECT id, control_id, title, objective, severity,
                release_state, category, obligation_ids
                FROM canonical_controls
                WHERE pattern_id = :pattern_id
                AND release_state NOT IN ('deprecated')
                ORDER BY control_id
                LIMIT :limit OFFSET :offset
            """),
            {"pattern_id": pattern_key, "limit": limit, "offset": offset},
        ).fetchall()

        total = db.execute(
            text("""
                SELECT count(*) FROM canonical_controls
                WHERE pattern_id = :pattern_id
                AND release_state NOT IN ('deprecated')
            """),
            {"pattern_id": pattern_key},
        ).fetchone()[0]

        controls = [
            {
                "id": str(rec[0]),
                "control_id": rec[1],
                "title": rec[2],
                "objective": rec[3],
                "severity": rec[4],
                "release_state": rec[5],
                "category": rec[6],
                "obligation_ids": _as_id_list(rec[7]),
            }
            for rec in page
        ]
        return {"controls": controls, "total": total}
    finally:
        db.close()
# =============================================================================
# OBLIGATION EXTRACTION ENDPOINT
# =============================================================================
@router.post("/obligations/extract", response_model=ObligationExtractResponse)
async def extract_obligation(req: ObligationExtractRequest):
    """Extract an obligation from the submitted text and keyword-match it to a pattern.

    The extractor receives the request text plus the optional regulation
    code (``""`` when omitted), article and paragraph. The pattern match
    runs on the extracted obligation text, falling back to its title and
    finally to the first 500 characters of the request text.
    """
    from compliance.services.obligation_extractor import ObligationExtractor
    from compliance.services.pattern_matcher import PatternMatcher

    extractor = ObligationExtractor()
    await extractor.initialize()
    extracted = await extractor.extract(
        chunk_text=req.text,
        regulation_code=req.regulation_code or "",
        article=req.article,
        paragraph=req.paragraph,
    )

    # Keyword-tier pattern match on the best available text.
    matcher = PatternMatcher()
    matcher._load_patterns()
    matcher._build_keyword_index()
    match_input = extracted.obligation_text or extracted.obligation_title or req.text[:500]
    match = matcher._tier1_keyword(match_input, extracted.regulation_id)

    if match:
        matched_pattern, matched_confidence = match.pattern_id, match.confidence
    else:
        matched_pattern, matched_confidence = None, 0

    return ObligationExtractResponse(
        obligation_id=extracted.obligation_id,
        obligation_title=extracted.obligation_title,
        obligation_text=extracted.obligation_text,
        method=extracted.method,
        confidence=extracted.confidence,
        regulation_id=extracted.regulation_id,
        pattern_id=matched_pattern,
        pattern_confidence=matched_confidence,
    )
# =============================================================================
# CROSSWALK MATRIX ENDPOINTS
# =============================================================================
@router.get("/crosswalk", response_model=CrosswalkQueryResponse)
async def query_crosswalk(
    regulation_code: Optional[str] = Query(None),
    article: Optional[str] = Query(None),
    obligation_id: Optional[str] = Query(None),
    pattern_id: Optional[str] = Query(None),
    limit: int = Query(100, ge=1, le=1000),
    offset: int = Query(0, ge=0),
):
    """Return a page of crosswalk rows matching the given equality filters.

    Only filters with a non-empty value are applied. ``total`` is the
    number of rows matching the filters regardless of paging.
    """
    db = SessionLocal()
    try:
        conditions = ["1=1"]
        params = {"limit": limit, "offset": offset}

        # (column, bind name, value) — column and bind names are fixed
        # literals, so only bound values ever come from the request.
        optional_filters = (
            ("regulation_code", "reg", regulation_code),
            ("article", "art", article),
            ("obligation_id", "obl", obligation_id),
            ("pattern_id", "pat", pattern_id),
        )
        for column, bind, value in optional_filters:
            if value:
                conditions.append(f"{column} = :{bind}")
                params[bind] = value

        where = " AND ".join(conditions)

        page = db.execute(
            text(f"""
                SELECT regulation_code, article, obligation_id,
                pattern_id, master_control_id, confidence, source
                FROM crosswalk_matrix
                WHERE {where}
                ORDER BY regulation_code, article
                LIMIT :limit OFFSET :offset
            """),
            params,
        ).fetchall()

        total = db.execute(
            text(f"SELECT count(*) FROM crosswalk_matrix WHERE {where}"),
            params,
        ).fetchone()[0]

        crosswalk_rows = []
        for rec in page:
            crosswalk_rows.append(
                CrosswalkRow(
                    regulation_code=rec[0] or "",
                    article=rec[1],
                    obligation_id=rec[2],
                    pattern_id=rec[3],
                    master_control_id=rec[4],
                    confidence=float(rec[5] or 0),
                    source=rec[6] or "auto",
                )
            )
        return CrosswalkQueryResponse(rows=crosswalk_rows, total=total)
    finally:
        db.close()
@router.get("/crosswalk/stats", response_model=CrosswalkStatsResponse)
async def crosswalk_stats():
    """Return aggregate coverage figures for the crosswalk matrix.

    Reports the total row count, the distinct regulations, obligations,
    patterns and master controls referenced, and the row count per
    non-empty regulation code.
    """
    db = SessionLocal()
    try:
        totals = db.execute(text("""
            SELECT
            count(*) AS total,
            count(DISTINCT regulation_code) FILTER (WHERE regulation_code != '') AS regs,
            count(DISTINCT obligation_id) FILTER (WHERE obligation_id IS NOT NULL) AS obls,
            count(DISTINCT pattern_id) FILTER (WHERE pattern_id IS NOT NULL) AS pats,
            count(DISTINCT master_control_id) FILTER (WHERE master_control_id IS NOT NULL) AS ctrls
            FROM crosswalk_matrix
        """)).fetchone()

        # Rows per regulation, largest first.
        per_regulation = db.execute(text("""
            SELECT regulation_code, count(*) AS cnt
            FROM crosswalk_matrix
            WHERE regulation_code != ''
            GROUP BY regulation_code
            ORDER BY cnt DESC
        """)).fetchall()

        coverage = {}
        for entry in per_regulation:
            coverage[entry[0]] = entry[1]

        return CrosswalkStatsResponse(
            total_rows=totals[0],
            regulations_covered=totals[1],
            obligations_linked=totals[2],
            patterns_used=totals[3],
            controls_linked=totals[4],
            coverage_by_regulation=coverage,
        )
    finally:
        db.close()
# =============================================================================
# MIGRATION ENDPOINTS
# =============================================================================
@router.post("/migrate/decompose", response_model=MigrationResponse)
async def migrate_decompose(req: MigrationRequest):
    """Run Pass 0a: extract obligation candidates from rich controls.

    All request options (limit, batch size, ``use_anthropic`` and the
    category/source filters) are forwarded to ``DecompositionPass.run_pass0a``.
    Per the original endpoint notes, ``use_anthropic=true`` selects the
    Anthropic API with prompt caching and content batching.

    Raises:
        HTTPException: 500 carrying the error text when the pass fails.
    """
    from compliance.services.decomposition_pass import DecompositionPass

    db = SessionLocal()
    try:
        pass_options = {
            "limit": req.limit,
            "batch_size": req.batch_size,
            "use_anthropic": req.use_anthropic,
            "category_filter": req.category_filter,
            "source_filter": req.source_filter,
        }
        outcome = await DecompositionPass(db=db).run_pass0a(**pass_options)
        return MigrationResponse(status="completed", stats=outcome)
    except Exception as exc:
        logger.error("Decomposition pass 0a failed: %s", exc)
        raise HTTPException(status_code=500, detail=str(exc))
    finally:
        db.close()
@router.post("/migrate/merge-obligations", response_model=MigrationResponse)
async def migrate_merge_obligations():
    """Run the merge pass over duplicate obligations.

    Per the original endpoint notes: run after Pass 0a and before Pass 0b;
    rule-based, no LLM calls; obligations sharing a similar action+object
    are merged into the more abstract survivor and the concrete duplicate
    is marked 'merged'.

    Raises:
        HTTPException: 500 carrying the error text when the pass fails.
    """
    from compliance.services.decomposition_pass import DecompositionPass

    db = SessionLocal()
    try:
        outcome = DecompositionPass(db=db).run_merge_pass()
        return MigrationResponse(status="completed", stats=outcome)
    except Exception as exc:
        logger.error("Merge pass failed: %s", exc)
        raise HTTPException(status_code=500, detail=str(exc))
    finally:
        db.close()
@router.post("/migrate/enrich-obligations", response_model=MigrationResponse)
async def migrate_enrich_obligations():
    """Run the enrichment pass that adds trigger and implementation metadata.

    Per the original endpoint notes: run after the merge pass and before
    Pass 0b; rule-based, no LLM calls; classifies ``trigger_type``
    (event/periodic/continuous) and flags implementation-specific
    obligations.

    Raises:
        HTTPException: 500 carrying the error text when the pass fails.
    """
    from compliance.services.decomposition_pass import DecompositionPass

    db = SessionLocal()
    try:
        outcome = DecompositionPass(db=db).enrich_obligations()
        return MigrationResponse(status="completed", stats=outcome)
    except Exception as exc:
        logger.error("Enrich pass failed: %s", exc)
        raise HTTPException(status_code=500, detail=str(exc))
    finally:
        db.close()
@router.post("/migrate/compose-atomic", response_model=MigrationResponse)
async def migrate_compose_atomic(req: MigrationRequest):
    """Run Pass 0b: compose atomic controls from obligation candidates.

    Only ``limit``, ``batch_size`` and ``use_anthropic`` are forwarded to
    ``DecompositionPass.run_pass0b``; the request's filters are not used
    here. Per the original endpoint notes, ``use_anthropic=true`` selects
    the Anthropic API with prompt caching and content batching.

    Raises:
        HTTPException: 500 carrying the error text when the pass fails.
    """
    from compliance.services.decomposition_pass import DecompositionPass

    db = SessionLocal()
    try:
        pass_options = {
            "limit": req.limit,
            "batch_size": req.batch_size,
            "use_anthropic": req.use_anthropic,
        }
        outcome = await DecompositionPass(db=db).run_pass0b(**pass_options)
        return MigrationResponse(status="completed", stats=outcome)
    except Exception as exc:
        logger.error("Decomposition pass 0b failed: %s", exc)
        raise HTTPException(status_code=500, detail=str(exc))
    finally:
        db.close()
@router.post("/migrate/batch-submit-0a", response_model=MigrationResponse)
async def batch_submit_pass0a(req: BatchSubmitRequest):
    """Submit Pass 0a as an Anthropic Batch API job.

    The service result's ``status`` entry (default ``"submitted"``) becomes
    the response status; the remaining entries are returned as ``stats``.
    Per the original endpoint notes, the result contains a batch id for
    polling and Anthropic processes the batch asynchronously within 24
    hours at reduced cost.

    Raises:
        HTTPException: 500 carrying the error text when submission fails.
    """
    from compliance.services.decomposition_pass import DecompositionPass

    db = SessionLocal()
    try:
        submission = await DecompositionPass(db=db).submit_batch_pass0a(
            limit=req.limit,
            batch_size=req.batch_size,
            category_filter=req.category_filter,
            source_filter=req.source_filter,
        )
        # Pop first so "status" is not repeated inside the stats payload.
        job_status = submission.pop("status", "submitted")
        return MigrationResponse(status=job_status, stats=submission)
    except Exception as exc:
        logger.error("Batch submit 0a failed: %s", exc)
        raise HTTPException(status_code=500, detail=str(exc))
    finally:
        db.close()
@router.post("/migrate/batch-submit-0b", response_model=MigrationResponse)
async def batch_submit_pass0b(req: BatchSubmitRequest):
    """Submit Pass 0b as an Anthropic Batch API job.

    Only ``limit`` and ``batch_size`` are forwarded. The service result's
    ``status`` entry (default ``"submitted"``) becomes the response status;
    the remaining entries are returned as ``stats``.

    Raises:
        HTTPException: 500 carrying the error text when submission fails.
    """
    from compliance.services.decomposition_pass import DecompositionPass

    db = SessionLocal()
    try:
        submission = await DecompositionPass(db=db).submit_batch_pass0b(
            limit=req.limit,
            batch_size=req.batch_size,
        )
        # Pop first so "status" is not repeated inside the stats payload.
        job_status = submission.pop("status", "submitted")
        return MigrationResponse(status=job_status, stats=submission)
    except Exception as exc:
        logger.error("Batch submit 0b failed: %s", exc)
        raise HTTPException(status_code=500, detail=str(exc))
    finally:
        db.close()
@router.get("/migrate/batch-status/{batch_id}")
async def batch_check_status(batch_id: str):
    """Return the processing status of an Anthropic batch job.

    The value produced by ``check_batch_status`` is passed through as is.

    Raises:
        HTTPException: 500 carrying the error text when the lookup fails.
    """
    from compliance.services.decomposition_pass import check_batch_status

    try:
        return await check_batch_status(batch_id)
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc))
@router.post("/migrate/batch-process", response_model=MigrationResponse)
async def batch_process_results(req: BatchProcessRequest):
    """Fetch and process the results of a completed Anthropic batch.

    Per the original endpoint notes, call this once batch-status reports
    ``processing_status='ended'``. The service result's ``status`` entry
    (default ``"completed"``) becomes the response status; the remaining
    entries are returned as ``stats``.

    Raises:
        HTTPException: 500 carrying the error text when processing fails.
    """
    from compliance.services.decomposition_pass import DecompositionPass

    db = SessionLocal()
    try:
        outcome = await DecompositionPass(db=db).process_batch_results(
            batch_id=req.batch_id,
            pass_type=req.pass_type,
        )
        final_status = outcome.pop("status", "completed")
        return MigrationResponse(status=final_status, stats=outcome)
    except Exception as exc:
        logger.error("Batch process failed: %s", exc)
        raise HTTPException(status_code=500, detail=str(exc))
    finally:
        db.close()
@router.post("/migrate/link-obligations", response_model=MigrationResponse)
async def migrate_link_obligations(req: MigrationRequest):
    """Run Pass 1: link controls to obligations via the source_citation article.

    Only ``req.limit`` is forwarded to the pass.

    Raises:
        HTTPException: 500 carrying the error text when the pass fails.
    """
    from compliance.services.pipeline_adapter import MigrationPasses

    db = SessionLocal()
    try:
        passes = MigrationPasses(db=db)
        await passes.initialize()
        outcome = await passes.run_pass1_obligation_linkage(limit=req.limit)
        return MigrationResponse(status="completed", stats=outcome)
    except Exception as exc:
        logger.error("Migration pass 1 failed: %s", exc)
        raise HTTPException(status_code=500, detail=str(exc))
    finally:
        db.close()
@router.post("/migrate/classify-patterns", response_model=MigrationResponse)
async def migrate_classify_patterns(req: MigrationRequest):
    """Run Pass 2: classify controls into patterns via keyword matching.

    Only ``req.limit`` is forwarded to the pass.

    Raises:
        HTTPException: 500 carrying the error text when the pass fails.
    """
    from compliance.services.pipeline_adapter import MigrationPasses

    db = SessionLocal()
    try:
        passes = MigrationPasses(db=db)
        await passes.initialize()
        outcome = await passes.run_pass2_pattern_classification(limit=req.limit)
        return MigrationResponse(status="completed", stats=outcome)
    except Exception as exc:
        logger.error("Migration pass 2 failed: %s", exc)
        raise HTTPException(status_code=500, detail=str(exc))
    finally:
        db.close()
@router.post("/migrate/triage", response_model=MigrationResponse)
async def migrate_triage():
    """Run Pass 3: quality triage that categorises controls by linkage completeness.

    Raises:
        HTTPException: 500 carrying the error text when the pass fails.
    """
    from compliance.services.pipeline_adapter import MigrationPasses

    db = SessionLocal()
    try:
        outcome = MigrationPasses(db=db).run_pass3_quality_triage()
        return MigrationResponse(status="completed", stats=outcome)
    except Exception as exc:
        logger.error("Migration pass 3 failed: %s", exc)
        raise HTTPException(status_code=500, detail=str(exc))
    finally:
        db.close()
@router.post("/migrate/backfill-crosswalk", response_model=MigrationResponse)
async def migrate_backfill_crosswalk():
    """Run Pass 4: create crosswalk rows for linked controls.

    Raises:
        HTTPException: 500 carrying the error text when the pass fails.
    """
    from compliance.services.pipeline_adapter import MigrationPasses

    db = SessionLocal()
    try:
        outcome = MigrationPasses(db=db).run_pass4_crosswalk_backfill()
        return MigrationResponse(status="completed", stats=outcome)
    except Exception as exc:
        logger.error("Migration pass 4 failed: %s", exc)
        raise HTTPException(status_code=500, detail=str(exc))
    finally:
        db.close()
@router.post("/migrate/deduplicate", response_model=MigrationResponse)
async def migrate_deduplicate():
    """Run Pass 5: mark duplicate controls (same obligation + pattern).

    Raises:
        HTTPException: 500 carrying the error text when the pass fails.
    """
    from compliance.services.pipeline_adapter import MigrationPasses

    db = SessionLocal()
    try:
        outcome = MigrationPasses(db=db).run_pass5_deduplication()
        return MigrationResponse(status="completed", stats=outcome)
    except Exception as exc:
        logger.error("Migration pass 5 failed: %s", exc)
        raise HTTPException(status_code=500, detail=str(exc))
    finally:
        db.close()
@router.get("/migrate/status", response_model=MigrationStatusResponse)
async def migration_status():
    """Return overall migration progress.

    The dict produced by ``MigrationPasses.migration_status()`` is expanded
    into a ``MigrationStatusResponse``.

    Raises:
        HTTPException: 500 carrying the error text when the lookup fails.
    """
    from compliance.services.pipeline_adapter import MigrationPasses

    db = SessionLocal()
    try:
        progress = MigrationPasses(db=db).migration_status()
        return MigrationStatusResponse(**progress)
    except Exception as exc:
        logger.error("Migration status failed: %s", exc)
        raise HTTPException(status_code=500, detail=str(exc))
    finally:
        db.close()
@router.get("/migrate/decomposition-status", response_model=DecompositionStatusResponse)
async def decomposition_status():
    """Return decomposition progress for Pass 0a/0b.

    The dict produced by ``DecompositionPass.decomposition_status()`` is
    expanded into a ``DecompositionStatusResponse``.

    Raises:
        HTTPException: 500 carrying the error text when the lookup fails.
    """
    from compliance.services.decomposition_pass import DecompositionPass

    db = SessionLocal()
    try:
        progress = DecompositionPass(db=db).decomposition_status()
        return DecompositionStatusResponse(**progress)
    except Exception as exc:
        logger.error("Decomposition status failed: %s", exc)
        raise HTTPException(status_code=500, detail=str(exc))
    finally:
        db.close()
# =============================================================================
# BATCH DEDUP ENDPOINTS
# =============================================================================
# Module-level runner reference for status polling
_batch_dedup_runner = None


@router.post("/migrate/batch-dedup", response_model=MigrationResponse)
async def migrate_batch_dedup(
    dry_run: bool = Query(False, description="Preview mode — no DB changes"),
    hint_filter: Optional[str] = Query(None, description="Only process hints matching this prefix"),
):
    """Run the batch dedup over Pass 0b controls and return its statistics.

    Per the original endpoint notes, the run reduces ~85k Pass 0b controls
    to ~18-25k masters in two phases:
      Phase 1: group by merge_group_hint, pick the best-quality master,
               link the rest.
      Phase 2: cross-group embedding search for semantically similar masters.

    While the run is in progress the runner is published in the module-level
    ``_batch_dedup_runner`` so the status endpoint can report progress.

    Raises:
        HTTPException: 500 carrying the error text when the run fails.
    """
    global _batch_dedup_runner
    from compliance.services.batch_dedup_runner import BatchDedupRunner

    db = SessionLocal()
    runner = None  # stays None if the runner cannot even be constructed
    try:
        runner = BatchDedupRunner(db=db)
        _batch_dedup_runner = runner
        stats = await runner.run(dry_run=dry_run, hint_filter=hint_filter)
        return MigrationResponse(status="completed", stats=stats)
    except Exception as e:
        logger.error("Batch dedup failed: %s", e)
        raise HTTPException(status_code=500, detail=str(e))
    finally:
        # Clear the shared reference only if it still belongs to THIS run.
        # Requests interleave at the ``await`` above, so an overlapping run
        # may have replaced the reference; wiping it unconditionally would
        # make the status endpoint report "not running" for a live run.
        if _batch_dedup_runner is runner:
            _batch_dedup_runner = None
        db.close()
@router.get("/migrate/batch-dedup/status")
async def batch_dedup_status():
    """Report batch dedup progress.

    While a run is registered, the runner's own status is returned with
    ``running: True``. Otherwise the figures come from the database: the
    Pass 0b control count, how many of those are duplicates, how many are
    neither duplicate nor deprecated, and the number of pending dedup
    reviews.
    """
    active_runner = _batch_dedup_runner
    if active_runner is not None:
        return {"running": True, **active_runner.get_status()}

    # No run registered: fall back to persisted figures.
    db = SessionLocal()
    try:
        counts = db.execute(text("""
            SELECT
            count(*) FILTER (WHERE decomposition_method = 'pass0b') AS total_pass0b,
            count(*) FILTER (WHERE decomposition_method = 'pass0b'
            AND release_state = 'duplicate') AS duplicates,
            count(*) FILTER (WHERE decomposition_method = 'pass0b'
            AND release_state != 'duplicate'
            AND release_state != 'deprecated') AS masters
            FROM canonical_controls
        """)).fetchone()
        pending = db.execute(text(
            "SELECT count(*) FROM control_dedup_reviews WHERE review_status = 'pending'"
        )).fetchone()[0]

        report = {"running": False}
        report["total_pass0b"] = counts[0]
        report["duplicates"] = counts[1]
        report["masters"] = counts[2]
        report["pending_reviews"] = pending
        return report
    finally:
        db.close()
# =============================================================================
# HELPERS
# =============================================================================
def _get_pattern_control_counts() -> dict[str, int]:
    """Return the number of non-deprecated controls per ``pattern_id``.

    Rows with a NULL or empty ``pattern_id`` are ignored. The lookup is
    best effort: if the query fails, the error is logged and an empty dict
    is returned, so callers show a count of 0 instead of failing.
    """
    db = SessionLocal()
    try:
        result = db.execute(text("""
            SELECT pattern_id, count(*) AS cnt
            FROM canonical_controls
            WHERE pattern_id IS NOT NULL AND pattern_id != ''
            AND release_state NOT IN ('deprecated')
            GROUP BY pattern_id
        """))
        return {row[0]: row[1] for row in result.fetchall()}
    except Exception as exc:
        # Keep the fallback, but leave a trace: otherwise a broken query is
        # indistinguishable from "no controls linked to any pattern".
        logger.warning("Could not load pattern control counts: %s", exc)
        return {}
    finally:
        db.close()
@@ -5,16 +5,23 @@ Endpoints:
- /dashboard: Main compliance dashboard
- /dashboard/executive: Executive summary for managers
- /dashboard/trend: Compliance score trend over time
- /dashboard/roadmap: Prioritised controls in 4 buckets
- /dashboard/module-status: Completion status of each SDK module
- /dashboard/next-actions: Top 5 most important actions
- /dashboard/snapshot: Save / query compliance score snapshots
- /score: Quick compliance score
- /reports: Report generation
"""
import logging
from datetime import datetime, timedelta, timezone
from datetime import datetime, date, timedelta
from calendar import month_abbr
from typing import Optional
from typing import Optional, Dict, Any, List
from decimal import Decimal
from fastapi import APIRouter, Depends, HTTPException, Query
from pydantic import BaseModel
from sqlalchemy import text
from sqlalchemy.orm import Session
from classroom_engine.database import get_db
@@ -25,15 +32,24 @@ from ..db import (
ControlRepository,
EvidenceRepository,
RiskRepository,
AssertionDB,
)
from .schemas import (
DashboardResponse,
MultiDimensionalScore,
ExecutiveDashboardResponse,
TrendDataPoint,
RiskSummary,
DeadlineItem,
TeamWorkloadItem,
TraceabilityAssertion,
TraceabilityEvidence,
TraceabilityCoverage,
TraceabilityControl,
TraceabilityMatrixResponse,
)
from .tenant_utils import get_tenant_id as _get_tenant_id
from .db_utils import row_to_dict as _row_to_dict
logger = logging.getLogger(__name__)
router = APIRouter(tags=["compliance-dashboard"])
@@ -86,6 +102,14 @@ async def get_dashboard(db: Session = Depends(get_db)):
# or compute from by_status dict
score = ctrl_stats.get("compliance_score", 0.0)
# Multi-dimensional score (Anti-Fake-Evidence)
try:
ms = ctrl_repo.get_multi_dimensional_score()
multi_score = MultiDimensionalScore(**ms)
except Exception as e:
logger.warning(f"Failed to compute multi-dimensional score: {e}")
multi_score = None
return DashboardResponse(
compliance_score=round(score, 1),
total_regulations=len(regulations),
@@ -98,6 +122,7 @@ async def get_dashboard(db: Session = Depends(get_db)):
total_risks=len(risks),
risks_by_level=risks_by_level,
recent_activity=[],
multi_score=multi_score,
)
@@ -116,11 +141,18 @@ async def get_compliance_score(db: Session = Depends(get_db)):
else:
score = 0
# Multi-dimensional score (Anti-Fake-Evidence)
try:
multi_score = ctrl_repo.get_multi_dimensional_score()
except Exception:
multi_score = None
return {
"score": round(score, 1),
"total_controls": total,
"passing_controls": passing,
"partial_controls": partial,
"multi_score": multi_score,
}
@@ -322,6 +354,424 @@ async def get_compliance_trend(
}
# ============================================================================
# Dashboard Extended — Roadmap, Module-Status, Next-Actions, Snapshots
# ============================================================================
# Weight map for control prioritisation, keyed by control category.
# The roadmap and next-actions endpoints fall back to weight 1 for
# categories that are not listed here.
_PRIORITY_WEIGHTS = {"legal": 5, "security": 3, "best_practice": 1, "operational": 2}
# SDK module definitions → DB table used for counting completion.
# "key" and "label" are echoed in the module-status response; "table" is the
# table whose rows for the current tenant are counted.
_MODULE_DEFS: List[Dict[str, str]] = [
    {"key": "vvt", "label": "VVT", "table": "compliance_vvt_activities"},
    {"key": "tom", "label": "TOM", "table": "compliance_toms"},
    {"key": "dsfa", "label": "DSFA", "table": "compliance_dsfa_assessments"},
    {"key": "loeschfristen", "label": "Loeschfristen", "table": "compliance_loeschfristen"},
    {"key": "risks", "label": "Risiken", "table": "compliance_risks"},
    {"key": "controls", "label": "Controls", "table": "compliance_controls"},
    {"key": "evidence", "label": "Nachweise", "table": "compliance_evidence"},
    {"key": "obligations", "label": "Pflichten", "table": "compliance_obligations"},
    {"key": "incidents", "label": "Vorfaelle", "table": "compliance_notfallplan_incidents"},
    {"key": "vendor", "label": "Auftragsverarbeiter", "table": "compliance_vendor_assessments"},
    {"key": "legal_templates", "label": "Rechtl. Dokumente", "table": "compliance_legal_templates"},
    {"key": "training", "label": "Schulungen", "table": "training_modules"},
    {"key": "audit", "label": "Audit", "table": "compliance_audit_sessions"},
    {"key": "security_backlog", "label": "Security-Backlog", "table": "compliance_security_backlog"},
    {"key": "quality", "label": "Qualitaet", "table": "compliance_quality_items"},
]
@router.get("/dashboard/roadmap")
async def get_dashboard_roadmap(
    db: Session = Depends(get_db),
    tenant_id: str = Depends(_get_tenant_id),
):
    """Prioritised controls in 4 buckets: Quick Wins, Must Have, Should Have, Nice to Have.

    Controls whose status is ``pass`` are skipped. Each remaining control is
    weighted by its category via ``_PRIORITY_WEIGHTS`` (weight 1 when the
    category is unknown) and bucketed as follows:

      - quick_wins:   weight >= 5 and the review date is in the past
      - must_have:    weight >= 4
      - should_have:  weight >= 2
      - nice_to_have: everything else

    Within each bucket, items are ordered by days overdue, most overdue first.

    Returns:
        A dict with the ``buckets``, the per-bucket ``counts`` and a
        ``generated_at`` ISO timestamp.
    """
    # NOTE(review): tenant_id is resolved by the dependency but not passed to
    # the repository here — confirm ControlRepository.get_all() is tenant-scoped.
    ctrl_repo = ControlRepository(db)
    controls = ctrl_repo.get_all()
    today = datetime.utcnow().date()

    buckets: Dict[str, list] = {
        "quick_wins": [],
        "must_have": [],
        "should_have": [],
        "nice_to_have": [],
    }

    for ctrl in controls:
        status = ctrl.status.value if ctrl.status else "planned"
        if status == "pass":
            continue  # already done

        weight = _PRIORITY_WEIGHTS.get(getattr(ctrl, "category", "best_practice"), 1)

        days_overdue = 0
        if ctrl.next_review_at:
            # next_review_at may be a datetime or already a plain date.
            review_date = ctrl.next_review_at.date() if hasattr(ctrl.next_review_at, "date") else ctrl.next_review_at
            days_overdue = (today - review_date).days

        item = {
            "id": str(ctrl.id),
            "control_id": ctrl.control_id,
            "title": ctrl.title,
            "status": status,
            "domain": ctrl.domain.value if ctrl.domain else "unknown",
            "owner": ctrl.owner,
            "next_review_at": ctrl.next_review_at.isoformat() if ctrl.next_review_at else None,
            # Negative values (review still in the future) are reported as 0.
            "days_overdue": max(0, days_overdue),
            "weight": weight,
        }

        if weight >= 5 and days_overdue > 0:
            buckets["quick_wins"].append(item)
        elif weight >= 4:
            buckets["must_have"].append(item)
        elif weight >= 2:
            buckets["should_have"].append(item)
        else:
            buckets["nice_to_have"].append(item)

    # Sort each bucket by days overdue, most overdue first.
    for items in buckets.values():
        items.sort(key=lambda x: x["days_overdue"], reverse=True)

    return {
        "buckets": buckets,
        "counts": {k: len(v) for k, v in buckets.items()},
        "generated_at": datetime.utcnow().isoformat(),
    }
@router.get("/dashboard/module-status")
async def get_module_status(
    db: Session = Depends(get_db),
    tenant_id: str = Depends(_get_tenant_id),
):
    """Report a completion status for each SDK module based on DB record counts.

    For every entry in ``_MODULE_DEFS`` the number of rows belonging to
    ``tenant_id`` is counted and mapped to a coarse status:
    0 rows = ``not_started``, 1-2 rows = ``in_progress``, 3+ = ``complete``.

    Returns:
        A dict with the per-module list (``modules``), the number of modules
        (``total``), how many are ``started`` / ``complete`` and the
        ``overall_progress`` percentage of completed modules.
    """
    import logging

    log = logging.getLogger(__name__)
    modules = []

    for mod in _MODULE_DEFS:
        try:
            # The table name comes from the module-level _MODULE_DEFS constant,
            # never from request data, so interpolating it is safe; the tenant
            # id is bound as a parameter.
            row = db.execute(
                text(f"SELECT COUNT(*) FROM {mod['table']} WHERE tenant_id = :tid"),
                {"tid": tenant_id},
            ).fetchone()
            count = int(row[0]) if row else 0
        except Exception as exc:
            # Best effort: a missing table/column must not break the dashboard.
            # Roll back so the failed statement does not leave the transaction
            # in an aborted state that would make every following count fail
            # (and silently report 0) as well.
            db.rollback()
            log.warning("Module status count failed for %s: %s", mod["table"], exc)
            count = 0

        # Simple heuristic: 0 = not started, 1-2 = in progress, 3+ = complete
        if count == 0:
            status = "not_started"
            progress = 0
        elif count < 3:
            status = "in_progress"
            progress = min(60, count * 30)
        else:
            status = "complete"
            progress = 100

        modules.append({
            "key": mod["key"],
            "label": mod["label"],
            "count": count,
            "status": status,
            "progress": progress,
        })

    started = sum(1 for m in modules if m["status"] != "not_started")
    complete = sum(1 for m in modules if m["status"] == "complete")

    return {
        "modules": modules,
        "total": len(modules),
        "started": started,
        "complete": complete,
        "overall_progress": round((complete / len(modules)) * 100, 1) if modules else 0,
    }
@router.get("/dashboard/next-actions")
async def get_next_actions(
    limit: int = Query(5, ge=1, le=20),
    db: Session = Depends(get_db),
    tenant_id: str = Depends(_get_tenant_id),
):
    """Return the ``limit`` most urgent open controls.

    Every control whose status is not ``"pass"`` gets an urgency score of
    ``weight * 10 + days_overdue``; the highest-scoring entries are returned
    under the ``actions`` key.
    """
    open_controls = ControlRepository(db).get_all()
    reference_day = datetime.utcnow().date()

    candidates = []
    for control in open_controls:
        current_status = control.status.value if control.status else "planned"
        if current_status == "pass":
            continue

        # Overdue days are clamped at zero: a review in the future adds nothing.
        overdue = 0
        if control.next_review_at:
            if hasattr(control.next_review_at, "date"):
                due_day = control.next_review_at.date()
            else:
                due_day = control.next_review_at
            overdue = max(0, (reference_day - due_day).days)

        category = control.category if hasattr(control, "category") else "best_practice"
        priority = _PRIORITY_WEIGHTS.get(category, 1)

        candidates.append({
            "id": str(control.id),
            "control_id": control.control_id,
            "title": control.title,
            "status": current_status,
            "domain": control.domain.value if control.domain else "unknown",
            "owner": control.owner,
            "days_overdue": overdue,
            "urgency_score": priority * 10 + overdue,
            "reason": "Ueberfaellig" if overdue > 0 else "Offen",
        })

    ranked = sorted(candidates, key=lambda entry: entry["urgency_score"], reverse=True)
    return {"actions": ranked[:limit]}
@router.post("/dashboard/snapshot")
async def create_score_snapshot(
    db: Session = Depends(get_db),
    tenant_id: str = Depends(_get_tenant_id),
):
    """Save the current compliance score as a historical snapshot.

    Collects control, evidence and risk statistics from their repositories,
    derives a percentage score and upserts one row into
    ``compliance_score_snapshots`` for today's date.

    Returns:
        The stored snapshot row converted with ``_row_to_dict``.
    """
    ctrl_repo = ControlRepository(db)
    evidence_repo = EvidenceRepository(db)
    risk_repo = RiskRepository(db)

    ctrl_stats = ctrl_repo.get_statistics()
    evidence_stats = evidence_repo.get_statistics()
    risks = risk_repo.get_all()

    total = ctrl_stats.get("total", 0)
    passing = ctrl_stats.get("pass", 0)
    partial = ctrl_stats.get("partial", 0)
    # Passing controls count fully, partial ones half; no controls -> score 0.
    score = round(((passing + partial * 0.5) / total) * 100, 2) if total > 0 else 0

    # A risk without an inherent_risk value is treated as "low".
    risks_high = sum(1 for r in risks if (r.inherent_risk.value if r.inherent_risk else "low") in ("high", "critical"))

    today = date.today()

    # NOTE(review): the ON CONFLICT target includes project_id, but the INSERT
    # never sets that column. If it ends up NULL, a plain unique constraint
    # would not treat two such rows as conflicting, so the update branch may
    # never fire - confirm against the table definition.
    row = db.execute(text("""
        INSERT INTO compliance_score_snapshots (
        tenant_id, score, controls_total, controls_pass, controls_partial,
        evidence_total, evidence_valid, risks_total, risks_high, snapshot_date
        ) VALUES (
        :tenant_id, :score, :controls_total, :controls_pass, :controls_partial,
        :evidence_total, :evidence_valid, :risks_total, :risks_high, :snapshot_date
        )
        ON CONFLICT (tenant_id, project_id, snapshot_date) DO UPDATE SET
        score = EXCLUDED.score,
        controls_total = EXCLUDED.controls_total,
        controls_pass = EXCLUDED.controls_pass,
        controls_partial = EXCLUDED.controls_partial,
        evidence_total = EXCLUDED.evidence_total,
        evidence_valid = EXCLUDED.evidence_valid,
        risks_total = EXCLUDED.risks_total,
        risks_high = EXCLUDED.risks_high
        RETURNING *
    """), {
        "tenant_id": tenant_id,
        "score": score,
        "controls_total": total,
        "controls_pass": passing,
        "controls_partial": partial,
        "evidence_total": evidence_stats.get("total", 0),
        "evidence_valid": evidence_stats.get("by_status", {}).get("valid", 0),
        "risks_total": len(risks),
        "risks_high": risks_high,
        "snapshot_date": today,
    }).fetchone()
    db.commit()

    return _row_to_dict(row)
@router.get("/dashboard/score-history")
async def get_score_history(
    months: int = Query(12, ge=1, le=36),
    db: Session = Depends(get_db),
    tenant_id: str = Depends(_get_tenant_id),
):
    """List the tenant's stored score snapshots, oldest first.

    The look-back window is ``months * 30`` days counted back from today.
    """
    cutoff = date.today() - timedelta(days=months * 30)
    query = text("""
        SELECT * FROM compliance_score_snapshots
        WHERE tenant_id = :tenant_id AND snapshot_date >= :since
        ORDER BY snapshot_date ASC
    """)
    fetched = db.execute(query, {"tenant_id": tenant_id, "since": cutoff}).fetchall()

    def _json_ready(record):
        # Decimal scores are turned into floats so the response can be JSON-encoded.
        entry = _row_to_dict(record)
        if isinstance(entry.get("score"), Decimal):
            entry["score"] = float(entry["score"])
        return entry

    history = [_json_ready(record) for record in fetched]

    return {
        "snapshots": history,
        "total": len(history),
        "period_months": months,
    }
# ============================================================================
# Evidence Distribution (Anti-Fake-Evidence Phase 3)
# ============================================================================
@router.get("/dashboard/evidence-distribution")
async def get_evidence_distribution(
    db: Session = Depends(get_db),
    tenant_id: str = Depends(_get_tenant_id),
):
    """Summarise all evidence by confidence level and open four-eyes reviews."""
    records = EvidenceRepository(db).get_all()

    level_counts = dict.fromkeys(("E0", "E1", "E2", "E3", "E4"), 0)
    awaiting_review = 0

    for record in records:
        # Evidence without a confidence level is counted as E1.
        level = record.confidence_level.value if record.confidence_level else "E1"
        if level in level_counts:
            level_counts[level] += 1

        # Four-eyes evidence stays "pending" until it is approved or rejected.
        if record.requires_four_eyes:
            if record.approval_status not in ("approved", "rejected"):
                awaiting_review += 1

    return {
        "by_confidence": level_counts,
        "four_eyes_pending": awaiting_review,
        "total": len(records),
    }
# ============================================================================
# Traceability Matrix (Anti-Fake-Evidence Phase 4a)
# ============================================================================
@router.get("/dashboard/traceability-matrix", response_model=TraceabilityMatrixResponse)
async def get_traceability_matrix(
    db: Session = Depends(get_db),
    tenant_id: str = Depends(_get_tenant_id),
):
    """
    Full traceability chain: Control -> Evidence -> Assertions.

    Loads each entity set once, builds in-memory indices, and nests
    the result so the frontend can render a matrix view.

    Returns:
        A ``TraceabilityMatrixResponse`` with one entry per control (including
        its evidence, their assertions and a coverage summary) plus an overall
        ``summary`` of total / covered / fully verified / uncovered controls.
    """
    ctrl_repo = ControlRepository(db)
    evidence_repo = EvidenceRepository(db)

    # 1. Load all three entity sets
    controls = ctrl_repo.get_all()
    all_evidence = evidence_repo.get_all()
    all_assertions = db.query(AssertionDB).filter(
        AssertionDB.entity_type == "evidence",
    ).all()

    # 2. Index assertions by evidence_id (entity_id). The key is normalised
    #    with str() because the lookup below uses str(e.id); without it a
    #    non-string entity_id would never match its evidence.
    assertions_by_evidence: Dict[str, list] = {}
    for a in all_assertions:
        assertions_by_evidence.setdefault(str(a.entity_id), []).append(a)

    # 3. Index evidence by control_id
    evidence_by_control: Dict[str, list] = {}
    for e in all_evidence:
        evidence_by_control.setdefault(str(e.control_id), []).append(e)

    # Ranking used to find the weakest confidence level per control.
    conf_order = {"E0": 0, "E1": 1, "E2": 2, "E3": 3, "E4": 4}

    # 4. Build nested response
    result_controls: list = []
    total_controls = 0
    covered_controls = 0
    fully_verified = 0

    for ctrl in controls:
        total_controls += 1
        ctrl_id = str(ctrl.id)
        ctrl_evidence = evidence_by_control.get(ctrl_id, [])

        nested_evidence: list = []
        has_evidence = len(ctrl_evidence) > 0
        has_assertions = False
        all_verified = True
        min_conf: Optional[str] = None

        for e in ctrl_evidence:
            ev_id = str(e.id)
            ev_assertions = assertions_by_evidence.get(ev_id, [])

            nested_assertions = [
                TraceabilityAssertion(
                    id=str(a.id),
                    sentence_text=a.sentence_text,
                    assertion_type=a.assertion_type or "assertion",
                    confidence=a.confidence or 0.0,
                    # An assertion counts as verified once a verifier is recorded.
                    verified=a.verified_by is not None,
                )
                for a in ev_assertions
            ]

            if nested_assertions:
                has_assertions = True
            for na in nested_assertions:
                if not na.verified:
                    all_verified = False

            # Evidence without a confidence level is treated as E1.
            conf = e.confidence_level.value if e.confidence_level else "E1"
            if min_conf is None or conf_order.get(conf, 1) < conf_order.get(min_conf, 1):
                min_conf = conf

            nested_evidence.append(TraceabilityEvidence(
                id=ev_id,
                title=e.title,
                evidence_type=e.evidence_type,
                confidence_level=conf,
                status=e.status.value if e.status else "valid",
                assertions=nested_assertions,
            ))

        # "All verified" is only meaningful when there is something to verify.
        if not has_assertions:
            all_verified = False

        if has_evidence:
            covered_controls += 1
        if has_evidence and has_assertions and all_verified:
            fully_verified += 1

        coverage = TraceabilityCoverage(
            has_evidence=has_evidence,
            has_assertions=has_assertions,
            all_assertions_verified=all_verified,
            min_confidence_level=min_conf,
        )

        result_controls.append(TraceabilityControl(
            id=ctrl_id,
            control_id=ctrl.control_id,
            title=ctrl.title,
            status=ctrl.status.value if ctrl.status else "planned",
            domain=ctrl.domain.value if ctrl.domain else "unknown",
            evidence=nested_evidence,
            coverage=coverage,
        ))

    summary = {
        "total_controls": total_controls,
        "covered_controls": covered_controls,
        "fully_verified": fully_verified,
        "uncovered_controls": total_controls - covered_controls,
    }

    return TraceabilityMatrixResponse(controls=result_controls, summary=summary)
# ============================================================================
# Reports
# ============================================================================
@@ -60,10 +60,314 @@ def get_dsfa_service(db: Session = Depends(get_db)) -> DSFAService:
return DSFAService(db)
def get_workflow_service(
    db: Session = Depends(get_db),
) -> DSFAWorkflowService:
    """Dependency provider: build a ``DSFAWorkflowService`` on the request's DB session."""
    workflow_service = DSFAWorkflowService(db)
    return workflow_service
# =============================================================================
# Pydantic Schemas
# =============================================================================
class DSFACreate(BaseModel):
    """Fields accepted when creating a DSFA record.

    Only ``title`` is required. The core fields carry concrete defaults; every
    section, metadata and JSONB field is optional and defaults to ``None``.
    """

    # Core fields
    title: str
    description: str = ""
    status: str = "draft"
    risk_level: str = "low"
    processing_activity: str = ""
    data_categories: List[str] = []
    recipients: List[str] = []
    measures: List[str] = []
    created_by: str = "system"
    # Section 1
    processing_description: Optional[str] = None
    processing_purpose: Optional[str] = None
    legal_basis: Optional[str] = None
    legal_basis_details: Optional[str] = None
    # Section 2
    necessity_assessment: Optional[str] = None
    proportionality_assessment: Optional[str] = None
    data_minimization: Optional[str] = None
    alternatives_considered: Optional[str] = None
    retention_justification: Optional[str] = None
    # Section 3
    involves_ai: Optional[bool] = None
    overall_risk_level: Optional[str] = None
    risk_score: Optional[int] = None
    # Section 6
    dpo_consulted: Optional[bool] = None
    dpo_name: Optional[str] = None
    dpo_opinion: Optional[str] = None
    dpo_approved: Optional[bool] = None
    authority_consulted: Optional[bool] = None
    authority_reference: Optional[str] = None
    authority_decision: Optional[str] = None
    # Metadata
    version: Optional[int] = None
    conclusion: Optional[str] = None
    federal_state: Optional[str] = None
    authority_resource_id: Optional[str] = None
    submitted_by: Optional[str] = None
    # JSONB Arrays (lists of strings or of free-form dicts)
    data_subjects: Optional[List[str]] = None
    affected_rights: Optional[List[str]] = None
    triggered_rule_codes: Optional[List[str]] = None
    ai_trigger_ids: Optional[List[str]] = None
    wp248_criteria_met: Optional[List[str]] = None
    art35_abs3_triggered: Optional[List[str]] = None
    tom_references: Optional[List[str]] = None
    risks: Optional[List[dict]] = None
    mitigations: Optional[List[dict]] = None
    stakeholder_consultations: Optional[List[dict]] = None
    review_triggers: Optional[List[dict]] = None
    review_comments: Optional[List[dict]] = None
    ai_use_case_modules: Optional[List[dict]] = None
    section_8_complete: Optional[bool] = None
    # JSONB Objects (free-form dicts)
    threshold_analysis: Optional[dict] = None
    consultation_requirement: Optional[dict] = None
    review_schedule: Optional[dict] = None
    section_progress: Optional[dict] = None
    metadata: Optional[dict] = None
class DSFAUpdate(BaseModel):
    """Fields accepted when updating a DSFA record.

    Every field is optional and defaults to ``None``. Compared with
    ``DSFACreate`` this schema has ``approved_by`` instead of ``created_by``.
    """

    # Core fields
    title: Optional[str] = None
    description: Optional[str] = None
    status: Optional[str] = None
    risk_level: Optional[str] = None
    processing_activity: Optional[str] = None
    data_categories: Optional[List[str]] = None
    recipients: Optional[List[str]] = None
    measures: Optional[List[str]] = None
    approved_by: Optional[str] = None
    # Section 1
    processing_description: Optional[str] = None
    processing_purpose: Optional[str] = None
    legal_basis: Optional[str] = None
    legal_basis_details: Optional[str] = None
    # Section 2
    necessity_assessment: Optional[str] = None
    proportionality_assessment: Optional[str] = None
    data_minimization: Optional[str] = None
    alternatives_considered: Optional[str] = None
    retention_justification: Optional[str] = None
    # Section 3
    involves_ai: Optional[bool] = None
    overall_risk_level: Optional[str] = None
    risk_score: Optional[int] = None
    # Section 6
    dpo_consulted: Optional[bool] = None
    dpo_name: Optional[str] = None
    dpo_opinion: Optional[str] = None
    dpo_approved: Optional[bool] = None
    authority_consulted: Optional[bool] = None
    authority_reference: Optional[str] = None
    authority_decision: Optional[str] = None
    # Metadata
    version: Optional[int] = None
    conclusion: Optional[str] = None
    federal_state: Optional[str] = None
    authority_resource_id: Optional[str] = None
    submitted_by: Optional[str] = None
    # JSONB Arrays (lists of strings or of free-form dicts)
    data_subjects: Optional[List[str]] = None
    affected_rights: Optional[List[str]] = None
    triggered_rule_codes: Optional[List[str]] = None
    ai_trigger_ids: Optional[List[str]] = None
    wp248_criteria_met: Optional[List[str]] = None
    art35_abs3_triggered: Optional[List[str]] = None
    tom_references: Optional[List[str]] = None
    risks: Optional[List[dict]] = None
    mitigations: Optional[List[dict]] = None
    stakeholder_consultations: Optional[List[dict]] = None
    review_triggers: Optional[List[dict]] = None
    review_comments: Optional[List[dict]] = None
    ai_use_case_modules: Optional[List[dict]] = None
    section_8_complete: Optional[bool] = None
    # JSONB Objects (free-form dicts)
    threshold_analysis: Optional[dict] = None
    consultation_requirement: Optional[dict] = None
    review_schedule: Optional[dict] = None
    section_progress: Optional[dict] = None
    metadata: Optional[dict] = None
class DSFAStatusUpdate(BaseModel):
    """Status-change payload: the new status plus an optional approver name."""

    status: str
    approved_by: Optional[str] = None
class DSFASectionUpdate(BaseModel):
    """Body for PUT /dsfa/{id}/sections/{section_number}."""

    content: Optional[str] = None
    # Free-form dict so the frontend can send any section-specific data
    extra: Optional[dict] = None
class DSFAApproveRequest(BaseModel):
    """Body for POST /dsfa/{id}/approve."""

    # Required decision flag; comments and approver name are optional.
    approved: bool
    comments: Optional[str] = None
    approved_by: Optional[str] = None
# =============================================================================
# Helpers
# =============================================================================
def _get_tenant_id(tenant_id: Optional[str]) -> str:
return tenant_id or DEFAULT_TENANT_ID
def _dsfa_to_response(row) -> dict:
"""Convert a DB row to a JSON-serializable dict."""
import json
# SQLAlchemy 2.0: Row objects need ._mapping for string-key access
if hasattr(row, "_mapping"):
row = row._mapping
def _parse_arr(val):
"""Parse a JSONB array field → list."""
if val is None:
return []
if isinstance(val, list):
return val
if isinstance(val, str):
try:
parsed = json.loads(val)
return parsed if isinstance(parsed, list) else []
except Exception:
return []
return val
def _parse_obj(val):
"""Parse a JSONB object field → dict."""
if val is None:
return {}
if isinstance(val, dict):
return val
if isinstance(val, str):
try:
parsed = json.loads(val)
return parsed if isinstance(parsed, dict) else {}
except Exception:
return {}
return val
def _ts(val):
"""Timestamp → ISO string or None."""
if not val:
return None
if isinstance(val, str):
return val
return val.isoformat()
def _get(key, default=None):
"""Safe row access — returns default if key missing (handles old rows)."""
try:
v = row[key]
return default if v is None and default is not None else v
except (KeyError, IndexError):
return default
return {
# Core fields (always present since Migration 024)
"id": str(row["id"]),
"tenant_id": row["tenant_id"],
"title": row["title"],
"description": row["description"] or "",
"status": row["status"] or "draft",
"risk_level": row["risk_level"] or "low",
"processing_activity": row["processing_activity"] or "",
"data_categories": _parse_arr(row["data_categories"]),
"recipients": _parse_arr(row["recipients"]),
"measures": _parse_arr(row["measures"]),
"approved_by": row["approved_by"],
"approved_at": _ts(row["approved_at"]),
"created_by": row["created_by"] or "system",
"created_at": _ts(row["created_at"]),
"updated_at": _ts(row["updated_at"]),
# Section 1 (Migration 030)
"processing_description": _get("processing_description"),
"processing_purpose": _get("processing_purpose"),
"legal_basis": _get("legal_basis"),
"legal_basis_details": _get("legal_basis_details"),
# Section 2
"necessity_assessment": _get("necessity_assessment"),
"proportionality_assessment": _get("proportionality_assessment"),
"data_minimization": _get("data_minimization"),
"alternatives_considered": _get("alternatives_considered"),
"retention_justification": _get("retention_justification"),
# Section 3
"involves_ai": _get("involves_ai", False),
"overall_risk_level": _get("overall_risk_level"),
"risk_score": _get("risk_score", 0),
# Section 6
"dpo_consulted": _get("dpo_consulted", False),
"dpo_consulted_at": _ts(_get("dpo_consulted_at")),
"dpo_name": _get("dpo_name"),
"dpo_opinion": _get("dpo_opinion"),
"dpo_approved": _get("dpo_approved"),
"authority_consulted": _get("authority_consulted", False),
"authority_consulted_at": _ts(_get("authority_consulted_at")),
"authority_reference": _get("authority_reference"),
"authority_decision": _get("authority_decision"),
# Metadata / Versioning
"version": _get("version", 1),
"previous_version_id": str(_get("previous_version_id")) if _get("previous_version_id") else None,
"conclusion": _get("conclusion"),
"federal_state": _get("federal_state"),
"authority_resource_id": _get("authority_resource_id"),
"submitted_for_review_at": _ts(_get("submitted_for_review_at")),
"submitted_by": _get("submitted_by"),
# JSONB Arrays
"data_subjects": _parse_arr(_get("data_subjects")),
"affected_rights": _parse_arr(_get("affected_rights")),
"triggered_rule_codes": _parse_arr(_get("triggered_rule_codes")),
"ai_trigger_ids": _parse_arr(_get("ai_trigger_ids")),
"wp248_criteria_met": _parse_arr(_get("wp248_criteria_met")),
"art35_abs3_triggered": _parse_arr(_get("art35_abs3_triggered")),
"tom_references": _parse_arr(_get("tom_references")),
"risks": _parse_arr(_get("risks")),
"mitigations": _parse_arr(_get("mitigations")),
"stakeholder_consultations": _parse_arr(_get("stakeholder_consultations")),
"review_triggers": _parse_arr(_get("review_triggers")),
"review_comments": _parse_arr(_get("review_comments")),
# Section 8 / AI (Migration 028)
"ai_use_case_modules": _parse_arr(_get("ai_use_case_modules")),
"section_8_complete": _get("section_8_complete", False),
# JSONB Objects
"threshold_analysis": _parse_obj(_get("threshold_analysis")),
"consultation_requirement": _parse_obj(_get("consultation_requirement")),
"review_schedule": _parse_obj(_get("review_schedule")),
"section_progress": _parse_obj(_get("section_progress")),
"metadata": _parse_obj(_get("metadata")),
}
def _log_audit(
    db: Session,
    tenant_id: str,
    dsfa_id,
    action: str,
    changed_by: str = "system",
    old_values=None,
    new_values=None,
):
    """Insert one row into ``compliance_dsfa_audit_log``.

    The statement is only executed on ``db``; committing is left to the caller.
    Empty or missing ``old_values`` / ``new_values`` are stored as NULL.
    """
    import json

    def _as_json(values):
        # Falsy payloads (None, {}) become SQL NULL rather than JSON text.
        return json.dumps(values) if values else None

    statement = text("""
        INSERT INTO compliance_dsfa_audit_log
        (tenant_id, dsfa_id, action, changed_by, old_values, new_values)
        VALUES
        (:tenant_id, :dsfa_id, :action, :changed_by,
        CAST(:old_values AS jsonb), CAST(:new_values AS jsonb))
    """)
    bindings = {
        "tenant_id": tenant_id,
        "dsfa_id": str(dsfa_id) if dsfa_id else None,
        "action": action,
        "changed_by": changed_by,
        "old_values": _as_json(old_values),
        "new_values": _as_json(new_values),
    }
    db.execute(statement, bindings)
# =============================================================================
@@ -177,8 +481,51 @@ async def create_dsfa(
service: DSFAService = Depends(get_dsfa_service),
) -> dict[str, Any]:
"""Neue DSFA erstellen."""
with translate_domain_errors():
return service.create(tenant_id, request)
import json
if request.status not in VALID_STATUSES:
raise HTTPException(status_code=422, detail=f"Ungültiger Status: {request.status}")
if request.risk_level not in VALID_RISK_LEVELS:
raise HTTPException(status_code=422, detail=f"Ungültiges Risiko-Level: {request.risk_level}")
tid = _get_tenant_id(tenant_id)
row = db.execute(
text("""
INSERT INTO compliance_dsfas
(tenant_id, title, description, status, risk_level,
processing_activity, data_categories, recipients, measures, created_by)
VALUES
(:tenant_id, :title, :description, :status, :risk_level,
:processing_activity,
CAST(:data_categories AS jsonb),
CAST(:recipients AS jsonb),
CAST(:measures AS jsonb),
:created_by)
RETURNING *
"""),
{
"tenant_id": tid,
"title": request.title,
"description": request.description,
"status": request.status,
"risk_level": request.risk_level,
"processing_activity": request.processing_activity,
"data_categories": json.dumps(request.data_categories),
"recipients": json.dumps(request.recipients),
"measures": json.dumps(request.measures),
"created_by": request.created_by,
},
).fetchone()
db.flush()
row_id = row._mapping["id"] if hasattr(row, "_mapping") else row[0]
_log_audit(
db, tid, row_id, "CREATE", request.created_by,
new_values={"title": request.title, "status": request.status},
)
db.commit()
return _dsfa_to_response(row)
# =============================================================================
File diff suppressed because it is too large Load Diff
@@ -22,23 +22,21 @@ from fastapi import APIRouter, Depends, File, HTTPException, Query, UploadFile
from sqlalchemy.orm import Session
from classroom_engine.database import get_db
from compliance.api._http_errors import translate_domain_errors
from compliance.db import ControlRepository, EvidenceRepository
from compliance.schemas.evidence import (
EvidenceCreate,
EvidenceListResponse,
EvidenceResponse,
from ..db import (
ControlRepository,
EvidenceRepository,
EvidenceStatusEnum,
EvidenceConfidenceEnum,
EvidenceTruthStatusEnum,
)
from compliance.services.auto_risk_updater import AutoRiskUpdater
from compliance.domain import NotFoundError, ValidationError
from compliance.services.evidence_service import (
SOURCE_CONTROL_MAP,
EvidenceService,
_extract_findings_detail, # re-exported for legacy test imports
_parse_ci_evidence, # re-exported for legacy test imports
_store_evidence, # re-exported for legacy test imports
_update_risks as _update_risks_impl,
from ..db.models import EvidenceDB, ControlDB, AuditTrailDB
from ..services.auto_risk_updater import AutoRiskUpdater
from .schemas import (
EvidenceCreate, EvidenceResponse, EvidenceListResponse,
EvidenceRejectRequest,
)
from .audit_trail_utils import log_audit_trail
logger = logging.getLogger(__name__)
router = APIRouter(tags=["compliance-evidence"])
@@ -56,7 +54,88 @@ def get_evidence_service(db: Session = Depends(get_db)) -> EvidenceService:
# ============================================================================
# Evidence CRUD
# Anti-Fake-Evidence: Four-Eyes Domain Check
# ============================================================================
FOUR_EYES_DOMAINS = {"gov", "priv"}
def _requires_four_eyes(control_domain: str) -> bool:
"""Controls in governance/privacy domains require two independent reviewers."""
return control_domain in FOUR_EYES_DOMAINS
# ============================================================================
# Anti-Fake-Evidence: Auto-Classification Helpers
# ============================================================================
def _classify_confidence(
    source: Optional[str],
    evidence_type: Optional[str] = None,
    artifact_hash: Optional[str] = None,
) -> EvidenceConfidenceEnum:
    """Map an evidence source to its confidence level.

    ``ci_pipeline`` and ``api`` yield E3, ``generated`` yields E0, and
    everything else (``manual``, ``upload``, unknown or missing) yields E1.
    """
    # NOTE(review): evidence_type and artifact_hash do not influence the
    # result - "api" maps to E3 with or without a hash. Confirm whether a
    # lower level was intended for API evidence without an artifact hash.
    if source in ("ci_pipeline", "api"):
        return EvidenceConfidenceEnum.E3
    if source == "generated":
        return EvidenceConfidenceEnum.E0
    # Manual/upload sources and any unrecognised source share the same level.
    return EvidenceConfidenceEnum.E1
def _classify_truth_status(source: Optional[str]) -> EvidenceTruthStatusEnum:
    """Map an evidence source to its truth status.

    ``ci_pipeline`` and ``api`` are OBSERVED, ``generated`` is GENERATED, and
    everything else (``manual``, ``upload``, unknown or missing) is UPLOADED.
    """
    if source in ("ci_pipeline", "api"):
        return EvidenceTruthStatusEnum.OBSERVED
    if source == "generated":
        return EvidenceTruthStatusEnum.GENERATED
    return EvidenceTruthStatusEnum.UPLOADED
def _build_evidence_response(e: EvidenceDB) -> EvidenceResponse:
    """Convert an ``EvidenceDB`` row into an ``EvidenceResponse``.

    Enum-valued columns are flattened to their ``.value`` (or ``None`` when
    unset); all other attributes, including the review and four-eyes fields,
    are copied across unchanged.
    """
    status_value = e.status.value if e.status else None
    confidence_value = e.confidence_level.value if e.confidence_level else None
    truth_value = e.truth_status.value if e.truth_status else None

    return EvidenceResponse(
        # Identity and description
        id=e.id,
        control_id=e.control_id,
        evidence_type=e.evidence_type,
        title=e.title,
        description=e.description,
        # Artifact details
        artifact_path=e.artifact_path,
        artifact_url=e.artifact_url,
        artifact_hash=e.artifact_hash,
        file_size_bytes=e.file_size_bytes,
        mime_type=e.mime_type,
        # Validity and provenance
        valid_from=e.valid_from,
        valid_until=e.valid_until,
        status=status_value,
        source=e.source,
        ci_job_id=e.ci_job_id,
        uploaded_by=e.uploaded_by,
        collected_at=e.collected_at,
        created_at=e.created_at,
        # Anti-fake-evidence classification
        confidence_level=confidence_value,
        truth_status=truth_value,
        generation_mode=e.generation_mode,
        may_be_used_as_evidence=e.may_be_used_as_evidence,
        # Review / four-eyes state
        reviewed_by=e.reviewed_by,
        reviewed_at=e.reviewed_at,
        approval_status=e.approval_status,
        first_reviewer=e.first_reviewer,
        first_reviewed_at=e.first_reviewed_at,
        second_reviewer=e.second_reviewer,
        second_reviewed_at=e.second_reviewed_at,
        requires_four_eyes=e.requires_four_eyes,
    )
# ============================================================================
# Evidence
# ============================================================================
@router.get("/evidence", response_model=EvidenceListResponse)
@@ -69,8 +148,38 @@ async def list_evidence(
service: EvidenceService = Depends(get_evidence_service),
) -> EvidenceListResponse:
"""List evidence with optional filters and pagination."""
with translate_domain_errors():
return service.list_evidence(control_id, evidence_type, status, page, limit)
repo = EvidenceRepository(db)
if control_id:
# First get the control UUID
ctrl_repo = ControlRepository(db)
control = ctrl_repo.get_by_control_id(control_id)
if not control:
raise HTTPException(status_code=404, detail=f"Control {control_id} not found")
evidence = repo.get_by_control(control.id)
else:
evidence = repo.get_all()
if evidence_type:
evidence = [e for e in evidence if e.evidence_type == evidence_type]
if status:
try:
status_enum = EvidenceStatusEnum(status)
evidence = [e for e in evidence if e.status == status_enum]
except ValueError:
pass
total = len(evidence)
# Apply pagination if requested
if page is not None and limit is not None:
offset = (page - 1) * limit
evidence = evidence[offset:offset + limit]
results = [_build_evidence_response(e) for e in evidence]
return EvidenceListResponse(evidence=results, total=total)
@router.post("/evidence", response_model=EvidenceResponse)
@@ -79,8 +188,66 @@ async def create_evidence(
service: EvidenceService = Depends(get_evidence_service),
) -> EvidenceResponse:
"""Create new evidence record."""
with translate_domain_errors():
return service.create_evidence(evidence_data)
repo = EvidenceRepository(db)
# Get control UUID
ctrl_repo = ControlRepository(db)
control = ctrl_repo.get_by_control_id(evidence_data.control_id)
if not control:
raise HTTPException(status_code=404, detail=f"Control {evidence_data.control_id} not found")
source = evidence_data.source or "api"
confidence = _classify_confidence(source, evidence_data.evidence_type)
truth = _classify_truth_status(source)
# Allow explicit override from request
if evidence_data.confidence_level:
try:
confidence = EvidenceConfidenceEnum(evidence_data.confidence_level)
except ValueError:
pass
if evidence_data.truth_status:
try:
truth = EvidenceTruthStatusEnum(evidence_data.truth_status)
except ValueError:
pass
evidence = repo.create(
control_id=control.id,
evidence_type=evidence_data.evidence_type,
title=evidence_data.title,
description=evidence_data.description,
artifact_url=evidence_data.artifact_url,
valid_from=evidence_data.valid_from,
valid_until=evidence_data.valid_until,
source=source,
ci_job_id=evidence_data.ci_job_id,
)
# Set anti-fake-evidence fields
evidence.confidence_level = confidence
evidence.truth_status = truth
# Generated evidence should not be used as evidence by default
if truth == EvidenceTruthStatusEnum.GENERATED:
evidence.may_be_used_as_evidence = False
# Four-Eyes: check if the linked control's domain requires it
control_domain = control.domain.value if control.domain else ""
if _requires_four_eyes(control_domain):
evidence.requires_four_eyes = True
evidence.approval_status = "pending_first"
db.commit()
# Audit trail
log_audit_trail(
db, "evidence", evidence.id, evidence.title, "create",
performed_by=evidence_data.source or "api",
change_summary=f"Evidence created with confidence={confidence.value}, truth={truth.value}",
)
db.commit()
return _build_evidence_response(evidence)
@router.delete("/evidence/{evidence_id}")
@@ -107,9 +274,271 @@ async def upload_evidence(
service: EvidenceService = Depends(get_evidence_service),
) -> EvidenceResponse:
"""Upload evidence file."""
with translate_domain_errors():
return await service.upload_evidence(
control_id, evidence_type, title, file, description
# Get control UUID
ctrl_repo = ControlRepository(db)
control = ctrl_repo.get_by_control_id(control_id)
if not control:
raise HTTPException(status_code=404, detail=f"Control {control_id} not found")
# Create upload directory
upload_dir = f"/tmp/compliance_evidence/{control_id}"
os.makedirs(upload_dir, exist_ok=True)
# Save file
file_path = os.path.join(upload_dir, file.filename)
content = await file.read()
with open(file_path, "wb") as f:
f.write(content)
# Calculate hash
file_hash = hashlib.sha256(content).hexdigest()
# Create evidence record
repo = EvidenceRepository(db)
evidence = repo.create(
control_id=control.id,
evidence_type=evidence_type,
title=title,
description=description,
artifact_path=file_path,
artifact_hash=file_hash,
file_size_bytes=len(content),
mime_type=file.content_type,
source="upload",
)
# Upload evidence → E1 + uploaded
evidence.confidence_level = EvidenceConfidenceEnum.E1
evidence.truth_status = EvidenceTruthStatusEnum.UPLOADED
# Four-Eyes: check if the linked control's domain requires it
control_domain = control.domain.value if control.domain else ""
if _requires_four_eyes(control_domain):
evidence.requires_four_eyes = True
evidence.approval_status = "pending_first"
db.commit()
return _build_evidence_response(evidence)
# ============================================================================
# CI/CD Evidence Collection — helpers
# ============================================================================
# Lookup table: CI source name (key) -> control ID (value) that evidence from
# that source is attached to.
SOURCE_CONTROL_MAP = {
    "sast": "SDLC-001",
    "dependency_scan": "SDLC-002",
    "secret_scan": "SDLC-003",
    "code_review": "SDLC-004",
    "sbom": "SDLC-005",
    "container_scan": "SDLC-006",
    "test_results": "AUD-001",
}
def _parse_ci_evidence(data: dict) -> dict:
    """
    Parse and validate incoming CI evidence data.

    Four report layouts are recognised, checked in this order:
    Semgrep (``results``), Trivy (``Results``), a generic ``findings`` array
    and an SBOM (``components``). Any other payload yields zero findings.

    Keys that are present but ``null`` (e.g. ``"Vulnerabilities": null``) and
    entries that are not JSON objects are tolerated instead of raising.
    CRITICAL/HIGH severities are counted for the generic ``findings`` layout
    too, so that this function agrees with ``_extract_findings_detail``.

    Args:
        data: Decoded JSON report from the CI job; may be empty or ``None``.

    Returns a dict with:
    - report_json: str (serialised JSON)
    - report_hash: str (SHA-256 hex digest)
    - evidence_status: str ("valid" or "failed")
    - findings_count: int
    - critical_findings: int
    """
    report_json = json.dumps(data) if data else "{}"
    report_hash = hashlib.sha256(report_json.encode()).hexdigest()

    def _is_blocking(entry, key, parent=None):
        """Return True when *entry* carries a CRITICAL or HIGH severity."""
        if parent is not None:
            entry = entry.get(parent) if isinstance(entry, dict) else None
        if not isinstance(entry, dict):
            return False
        return str(entry.get(key) or "").upper() in ("CRITICAL", "HIGH")

    findings_count = 0
    critical_findings = 0

    if data and isinstance(data, dict):
        # Semgrep format: severity lives under result["extra"]["severity"]
        if "results" in data:
            results = data.get("results") or []
            findings_count = len(results)
            critical_findings = sum(
                _is_blocking(r, "severity", parent="extra") for r in results
            )
        # Trivy format: one vulnerability list per scanned target
        elif "Results" in data:
            for result in data.get("Results") or []:
                vulns = (
                    result.get("Vulnerabilities") if isinstance(result, dict) else None
                ) or []
                findings_count += len(vulns)
                critical_findings += sum(_is_blocking(v, "Severity") for v in vulns)
        # Generic findings array with an optional per-finding "severity"
        elif "findings" in data:
            findings = data.get("findings") or []
            findings_count = len(findings)
            critical_findings = sum(_is_blocking(f, "severity") for f in findings)
        # SBOM format - just count components
        elif "components" in data:
            findings_count = len(data.get("components") or [])

    # Any critical/high finding marks the evidence as failed.
    evidence_status = "failed" if critical_findings > 0 else "valid"

    return {
        "report_json": report_json,
        "report_hash": report_hash,
        "evidence_status": evidence_status,
        "findings_count": findings_count,
        "critical_findings": critical_findings,
    }
def _store_evidence(
    db: Session,
    *,
    control_db_id: str,
    source: str,
    parsed: dict,
    ci_job_id: str,
    ci_job_url: str,
    report_data: dict,
) -> EvidenceDB:
    """
    Persist a CI evidence item to the database and write the report file.

    Args:
        db: Open database session; the new row is added and committed here.
        control_db_id: Primary key of the control the evidence is linked to.
        source: CI source name; used in the title, evidence type and file path.
        parsed: Dict with the keys ``report_json``, ``report_hash``,
            ``evidence_status``, ``findings_count`` and ``critical_findings``.
        ci_job_id: CI job identifier; stored and added to the description if set.
        ci_job_url: CI job URL; added to the description if set.
        report_data: Raw report payload written to disk (``{}`` when falsy).

    Returns the created EvidenceDB instance (already committed).
    """
    findings_count = parsed["findings_count"]
    critical_findings = parsed["critical_findings"]

    # Sample each clock once so the title and the file name carry the same
    # timestamp and valid_until is exactly valid_from + 90 days.
    local_now = datetime.now()
    utc_now = datetime.utcnow()

    # Build title and description
    title = f"{source.upper()} Report - {local_now.strftime('%Y-%m-%d %H:%M')}"
    description_lines = ["Automatically collected from CI/CD pipeline"]
    if findings_count > 0:
        description_lines.append(f"- Total findings: {findings_count}")
    if critical_findings > 0:
        description_lines.append(f"- Critical/High findings: {critical_findings}")
    if ci_job_id:
        description_lines.append(f"- CI Job ID: {ci_job_id}")
    if ci_job_url:
        description_lines.append(f"- CI Job URL: {ci_job_url}")
    description = "\n".join(description_lines)

    # Store report file
    # NOTE(review): `source` is interpolated into the path unchecked here;
    # presumably the caller restricts it to known source names - confirm.
    upload_dir = f"/tmp/compliance_evidence/ci/{source}"
    os.makedirs(upload_dir, exist_ok=True)
    file_name = (
        f"{source}_{local_now.strftime('%Y%m%d_%H%M%S')}_{parsed['report_hash'][:8]}.json"
    )
    file_path = os.path.join(upload_dir, file_name)
    with open(file_path, "w", encoding="utf-8") as f:
        json.dump(report_data or {}, f, indent=2)

    # NOTE(review): artifact_hash and file_size_bytes below describe
    # parsed["report_json"], while the file on disk is an indent=2 dump of
    # report_data. If `parsed` holds a differently formatted serialisation the
    # stored hash will not match the artifact bytes - confirm before relying
    # on hash verification of the file.

    # Create evidence record with anti-fake-evidence classification
    evidence = EvidenceDB(
        id=str(uuid_module.uuid4()),
        control_id=control_db_id,
        evidence_type=f"ci_{source}",
        title=title,
        description=description,
        artifact_path=file_path,
        artifact_hash=parsed["report_hash"],
        file_size_bytes=len(parsed["report_json"]),
        mime_type="application/json",
        source="ci_pipeline",
        ci_job_id=ci_job_id,
        valid_from=utc_now,
        valid_until=utc_now + timedelta(days=90),
        status=EvidenceStatusEnum(parsed["evidence_status"]),
        # CI pipeline evidence → E3 observed (system-observed, hash-verified)
        confidence_level=EvidenceConfidenceEnum.E3,
        truth_status=EvidenceTruthStatusEnum.OBSERVED,
        may_be_used_as_evidence=True,
    )
    db.add(evidence)
    db.commit()
    db.refresh(evidence)

    return evidence
def _extract_findings_detail(report_data: dict) -> dict:
    """
    Extract severity-bucketed finding counts from report data.

    The report layout is detected in this order: Semgrep (``results``),
    Trivy (``Results``), generic (``findings``). Bucketing rules per layout:

    - Semgrep: CRITICAL/HIGH/MEDIUM map to their bucket, LOW and INFO to
      ``low``; any other severity is ignored.
    - Trivy: CRITICAL/HIGH/MEDIUM/LOW map to their bucket; others are ignored.
    - Generic: CRITICAL/HIGH/MEDIUM map to their bucket; everything else
      (including a missing severity) counts as ``low``.

    ``null`` lists, ``null`` severities and non-object entries are tolerated
    instead of raising. A falsy or non-dict report yields all-zero counts.

    Returns dict with keys: critical, high, medium, low.
    """
    findings_detail = {
        "critical": 0,
        "high": 0,
        "medium": 0,
        "low": 0,
    }

    if not report_data or not isinstance(report_data, dict):
        return findings_detail

    def _tally(entry, key, *, low_labels=("LOW",), unknown_is_low=False):
        """Add one finding to the bucket matching entry[key], if any."""
        raw = entry.get(key) if isinstance(entry, dict) else None
        severity = str(raw or "").upper()
        if severity in ("CRITICAL", "HIGH", "MEDIUM"):
            findings_detail[severity.lower()] += 1
        elif unknown_is_low or severity in low_labels:
            findings_detail["low"] += 1

    # Semgrep format
    if "results" in report_data:
        for r in report_data.get("results") or []:
            extra = r.get("extra") if isinstance(r, dict) else None
            _tally(extra, "severity", low_labels=("LOW", "INFO"))
    # Trivy format
    elif "Results" in report_data:
        for result in report_data.get("Results") or []:
            vulns = (
                result.get("Vulnerabilities") if isinstance(result, dict) else None
            ) or []
            for v in vulns:
                _tally(v, "Severity")
    # Generic findings with severity
    elif "findings" in report_data:
        for f in report_data.get("findings") or []:
            _tally(f, "severity", unknown_is_low=True)

    return findings_detail
def _update_risks(db: Session, *, source: str, control_id: str, ci_job_id: str, report_data: dict):
"""
Update risk status based on new evidence.
Uses AutoRiskUpdater to update Control status and linked Risks based on
severity-bucketed findings. Returns the update result or None on error.
"""
findings_detail = _extract_findings_detail(report_data)
try:
auto_updater = AutoRiskUpdater(db)
risk_update_result = auto_updater.process_evidence_collect_request(
tool=source,
control_id=control_id,
evidence_type=f"ci_{source}",
timestamp=datetime.utcnow().isoformat(),
commit_sha=report_data.get("commit_sha", "unknown") if report_data else "unknown",
ci_job_id=ci_job_id,
findings=findings_detail,
)
@@ -227,14 +656,229 @@ async def get_ci_evidence_status(
# Legacy re-exports for tests that import helpers directly.
# ----------------------------------------------------------------------------
__all__ = [
"router",
"SOURCE_CONTROL_MAP",
"EvidenceRepository",
"ControlRepository",
"AutoRiskUpdater",
"_parse_ci_evidence",
"_extract_findings_detail",
"_store_evidence",
"_update_risks",
]
if control_id:
ctrl_repo = ControlRepository(db)
control = ctrl_repo.get_by_control_id(control_id)
if control:
query = query.filter(EvidenceDB.control_id == control.id)
evidence_list = query.order_by(EvidenceDB.collected_at.desc()).limit(100).all()
# Group by control and calculate stats
control_stats = defaultdict(lambda: {
"total": 0,
"valid": 0,
"failed": 0,
"last_collected": None,
"evidence": [],
})
for e in evidence_list:
# Get control_id string
control = db.query(ControlDB).filter(ControlDB.id == e.control_id).first()
ctrl_id = control.control_id if control else "unknown"
stats = control_stats[ctrl_id]
stats["total"] += 1
if e.status:
if e.status.value == "valid":
stats["valid"] += 1
elif e.status.value == "failed":
stats["failed"] += 1
if not stats["last_collected"] or e.collected_at > stats["last_collected"]:
stats["last_collected"] = e.collected_at
# Add evidence summary
stats["evidence"].append({
"id": e.id,
"type": e.evidence_type,
"status": e.status.value if e.status else None,
"collected_at": e.collected_at.isoformat() if e.collected_at else None,
"ci_job_id": e.ci_job_id,
})
# Convert to list and sort
result = []
for ctrl_id, stats in control_stats.items():
result.append({
"control_id": ctrl_id,
"total_evidence": stats["total"],
"valid_count": stats["valid"],
"failed_count": stats["failed"],
"last_collected": stats["last_collected"].isoformat() if stats["last_collected"] else None,
"recent_evidence": stats["evidence"][:5],
})
result.sort(key=lambda x: x["last_collected"] or "", reverse=True)
return {
"period_days": days,
"total_evidence": len(evidence_list),
"controls": result,
}
# ============================================================================
# Evidence Review (Anti-Fake-Evidence)
# ============================================================================
from pydantic import BaseModel as _BaseModel
class _EvidenceReviewRequest(_BaseModel):
    """Request body for PATCH /evidence/{evidence_id}/review."""

    # Optional new confidence level; the handler converts it with
    # EvidenceConfidenceEnum and answers 400 on an unknown value.
    confidence_level: Optional[str] = None
    # Optional new truth status; the handler converts it with
    # EvidenceTruthStatusEnum and answers 400 on an unknown value.
    truth_status: Optional[str] = None
    # Identifier of the reviewer (required); also used for the Four-Eyes check.
    reviewed_by: str
@router.patch("/evidence/{evidence_id}/review", response_model=EvidenceResponse)
async def review_evidence(
    evidence_id: str,
    review: _EvidenceReviewRequest,
    db: Session = Depends(get_db),
):
    """
    Apply a review to an evidence item.

    The reviewer may set a new confidence level and/or truth status. When the
    evidence requires Four-Eyes approval, the first review records
    first_reviewer and moves approval_status to 'first_approved'; a second
    review by a different person records second_reviewer and moves it to
    'approved'. Changes to confidence level or truth status are written to the
    audit trail.

    Raises:
        HTTPException 404: evidence not found.
        HTTPException 400: invalid enum value, same reviewer twice, or the
            evidence is already approved / rejected.
    """
    def _plain(member):
        # Enum member -> its value; unset -> None.
        return member.value if member else None

    evidence = db.query(EvidenceDB).filter(EvidenceDB.id == evidence_id).first()
    if not evidence:
        raise HTTPException(status_code=404, detail=f"Evidence {evidence_id} not found")

    # Remember the values before the review so changes can be audited later.
    before = {
        "confidence_level": _plain(evidence.confidence_level),
        "truth_status": _plain(evidence.truth_status),
    }

    if review.confidence_level:
        try:
            evidence.confidence_level = EvidenceConfidenceEnum(review.confidence_level)
        except ValueError:
            raise HTTPException(status_code=400, detail=f"Invalid confidence_level: {review.confidence_level}")

    if review.truth_status:
        try:
            evidence.truth_status = EvidenceTruthStatusEnum(review.truth_status)
        except ValueError:
            raise HTTPException(status_code=400, detail=f"Invalid truth_status: {review.truth_status}")

    reviewer = review.reviewed_by

    # Four-Eyes workflow: terminal states are refused first, then the
    # first/second approval step is recorded.
    if evidence.requires_four_eyes:
        stage = evidence.approval_status or "none"
        if stage == "approved":
            raise HTTPException(status_code=400, detail="Evidence already approved")
        if stage == "rejected":
            raise HTTPException(status_code=400, detail="Evidence was rejected — create new evidence instead")
        if stage == "first_approved":
            if reviewer == evidence.first_reviewer:
                raise HTTPException(
                    status_code=400,
                    detail="Four-Eyes: second reviewer must be different from first reviewer",
                )
            evidence.second_reviewer = reviewer
            evidence.second_reviewed_at = datetime.utcnow()
            evidence.approval_status = "approved"
        elif stage in ("none", "pending_first"):
            evidence.first_reviewer = reviewer
            evidence.first_reviewed_at = datetime.utcnow()
            evidence.approval_status = "first_approved"

    evidence.reviewed_by = reviewer
    evidence.reviewed_at = datetime.utcnow()
    db.commit()

    # Audit trail: one entry per field whose value actually changed.
    for field_name, old_value in before.items():
        new_value = _plain(getattr(evidence, field_name))
        if old_value != new_value:
            log_audit_trail(
                db, "evidence", evidence_id, evidence.title, "review",
                performed_by=reviewer,
                field_changed=field_name,
                old_value=old_value,
                new_value=new_value,
            )

    db.commit()
    db.refresh(evidence)
    return _build_evidence_response(evidence)
@router.patch("/evidence/{evidence_id}/reject", response_model=EvidenceResponse)
async def reject_evidence(
    evidence_id: str,
    body: EvidenceRejectRequest,
    db: Session = Depends(get_db),
):
    """
    Mark an evidence item as rejected and record the decision in the audit trail.

    Sets approval_status to 'rejected' and stamps reviewer and review time.
    Responds with 404 when no evidence with the given ID exists.
    """
    record = db.query(EvidenceDB).filter(EvidenceDB.id == evidence_id).first()
    if not record:
        raise HTTPException(status_code=404, detail=f"Evidence {evidence_id} not found")

    rejected_by = body.reviewed_by
    record.approval_status = "rejected"
    record.reviewed_by = rejected_by
    record.reviewed_at = datetime.utcnow()
    db.commit()

    # Fall back to a generic summary when the caller gave no reason.
    summary = body.rejection_reason or "Evidence rejected"
    log_audit_trail(
        db, "evidence", evidence_id, record.title, "reject",
        performed_by=rejected_by,
        change_summary=summary,
    )
    db.commit()
    db.refresh(record)

    return _build_evidence_response(record)
# ============================================================================
# Audit Trail Query
# ============================================================================
@router.get("/audit-trail")
async def get_audit_trail(
    entity_type: Optional[str] = Query(None),
    entity_id: Optional[str] = Query(None),
    action: Optional[str] = Query(None),
    limit: int = Query(50, ge=1, le=200),
    db: Session = Depends(get_db),
):
    """
    Return audit trail entries, newest first.

    Each of entity_type, entity_id and action narrows the result when given.
    At most `limit` entries are returned; "total" is the number of entries in
    this response.
    """
    def _as_entry(row):
        # Flatten one AuditTrailDB row into a JSON-friendly dict.
        stamp = row.performed_at
        return {
            "id": row.id,
            "entity_type": row.entity_type,
            "entity_id": row.entity_id,
            "entity_name": row.entity_name,
            "action": row.action,
            "field_changed": row.field_changed,
            "old_value": row.old_value,
            "new_value": row.new_value,
            "change_summary": row.change_summary,
            "performed_by": row.performed_by,
            "performed_at": stamp.isoformat() if stamp else None,
            "checksum": row.checksum,
        }

    query = db.query(AuditTrailDB)

    # Apply only the filters the caller actually supplied.
    criteria = (
        (AuditTrailDB.entity_type, entity_type),
        (AuditTrailDB.entity_id, entity_id),
        (AuditTrailDB.action, action),
    )
    for column, wanted in criteria:
        if wanted:
            query = query.filter(column == wanted)

    records = query.order_by(AuditTrailDB.performed_at.desc()).limit(limit).all()
    entries = [_as_entry(row) for row in records]

    return {
        "entries": entries,
        "total": len(records),
    }
@@ -39,7 +39,6 @@ router = APIRouter(tags=["extraction"])
ALL_COLLECTIONS = [
"bp_compliance_ce", # BSI-TR documents — primary Prüfaspekte source
"bp_compliance_recht", # Legal texts (GDPR, AI Act, ...)
"bp_compliance_gesetze", # German laws
"bp_compliance_datenschutz", # Data protection documents
"bp_dsfa_corpus", # DSFA corpus
@@ -80,9 +80,13 @@ def _handle(func, *args, **kwargs): # type: ignore[no-untyped-def]
raise HTTPException(status_code=400, detail=str(exc))
# ============================================================================
# ISMS Scope (ISO 27001 4.3)
# ============================================================================
# Shared audit trail utilities — canonical implementation in audit_trail_utils.py
from .audit_trail_utils import log_audit_trail, create_signature # noqa: E402
# =============================================================================
# ISMS SCOPE (ISO 27001 4.3)
# =============================================================================
@router.get("/scope", response_model=ISMSScopeResponse)
async def get_isms_scope(db: Session = Depends(get_db)):
@@ -50,6 +50,57 @@ VALID_DOCUMENT_TYPES = {
"cookie_banner",
"agb",
"clause",
# Security document templates (Migration 051)
"it_security_concept",
"data_protection_concept",
"backup_recovery_concept",
"logging_concept",
"incident_response_plan",
"access_control_concept",
"risk_management_concept",
# Policy templates — IT Security (Migration 054)
"information_security_policy",
"access_control_policy",
"password_policy",
"encryption_policy",
"logging_policy",
"backup_policy",
"incident_response_policy",
"change_management_policy",
"patch_management_policy",
"asset_management_policy",
"cloud_security_policy",
"devsecops_policy",
"secrets_management_policy",
"vulnerability_management_policy",
# Policy templates — Data (Migration 054)
"data_protection_policy",
"data_classification_policy",
"data_retention_policy",
"data_transfer_policy",
"privacy_incident_policy",
# Policy templates — Personnel (Migration 054)
"employee_security_policy",
"security_awareness_policy",
"remote_work_policy",
"offboarding_policy",
# Policy templates — Vendor/Supply Chain (Migration 054)
"vendor_risk_management_policy",
"third_party_security_policy",
"supplier_security_policy",
# Policy templates — BCM (Migration 054)
"business_continuity_policy",
"disaster_recovery_policy",
"crisis_management_policy",
# CRA Cybersecurity (Migration 056)
"cybersecurity_policy",
# DSFA template
"dsfa",
# Module document templates (Migration 073)
"vvt_register",
"tom_documentation",
"loeschkonzept",
"pflichtenregister",
}
VALID_STATUSES = {"published", "draft", "archived"}
@@ -0,0 +1,162 @@
"""
FastAPI routes for LLM Generation Audit Trail.
Endpoints:
- POST /llm-audit: Record an LLM generation event
- GET /llm-audit: List audit records with filters
"""
import logging
import uuid as uuid_module
from datetime import datetime
from typing import Optional
from fastapi import APIRouter, Depends, Query
from pydantic import BaseModel
from sqlalchemy.orm import Session
from classroom_engine.database import get_db
from ..db.models import LLMGenerationAuditDB
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Router that collects the LLM audit endpoints defined below; its routes are
# tagged "compliance-llm-audit".
router = APIRouter(tags=["compliance-llm-audit"])
# ============================================================================
# Schemas
# ============================================================================
class LLMAuditCreate(BaseModel):
    """Request body for POST /llm-audit: one LLM generation event to record."""

    # Type and (optional) ID of the entity the generated content belongs to.
    entity_type: str
    entity_id: Optional[str] = None
    # Caller-defined label for how the content was generated (stored as given).
    generation_mode: str
    # Converted to EvidenceTruthStatusEnum by the handler; unknown values are
    # stored as GENERATED.
    truth_status: str = "generated"
    may_be_used_as_evidence: bool = False
    llm_model: Optional[str] = None
    llm_provider: Optional[str] = None
    prompt_hash: Optional[str] = None
    # Both summaries are cut to their first 500 characters before storage.
    input_summary: Optional[str] = None
    output_summary: Optional[str] = None
    # Persisted on the record's extra_metadata attribute ({} when omitted).
    metadata: Optional[dict] = None
    tenant_id: Optional[str] = None
class LLMAuditResponse(BaseModel):
    """API representation of a stored LLM generation audit record."""

    id: str
    tenant_id: Optional[str] = None
    entity_type: str
    entity_id: Optional[str] = None
    generation_mode: str
    # String value of the record's truth status enum.
    truth_status: str
    may_be_used_as_evidence: bool
    llm_model: Optional[str] = None
    llm_provider: Optional[str] = None
    prompt_hash: Optional[str] = None
    input_summary: Optional[str] = None
    output_summary: Optional[str] = None
    # Filled from the record's extra_metadata attribute by the route handlers.
    metadata: Optional[dict] = None
    created_at: datetime

    class Config:
        # Pydantic setting that permits building the model from objects with
        # matching attributes (the handlers in this module pass fields
        # explicitly instead).
        from_attributes = True
# ============================================================================
# Routes
# ============================================================================
@router.post("/llm-audit", response_model=LLMAuditResponse)
async def create_llm_audit(
    data: LLMAuditCreate,
    db: Session = Depends(get_db),
):
    """
    Record an LLM generation event for audit trail.

    An unrecognised ``truth_status`` does not fail the request: the record is
    stored with ``GENERATED`` instead, and the substitution is logged as a
    warning so it stays traceable. Input and output summaries are cut to
    their first 500 characters.

    Returns the stored record as an LLMAuditResponse.
    """
    from ..db.models import EvidenceTruthStatusEnum

    # Validate truth_status
    try:
        truth_enum = EvidenceTruthStatusEnum(data.truth_status)
    except ValueError:
        # Best-effort fallback is kept, but no longer silent: an audit record
        # whose status was rewritten should leave a trace in the logs.
        logger.warning(
            "Unknown truth_status %r for LLM audit record (entity_type=%r); "
            "falling back to GENERATED",
            data.truth_status,
            data.entity_type,
        )
        truth_enum = EvidenceTruthStatusEnum.GENERATED

    record = LLMGenerationAuditDB(
        id=str(uuid_module.uuid4()),
        tenant_id=data.tenant_id,
        entity_type=data.entity_type,
        entity_id=data.entity_id,
        generation_mode=data.generation_mode,
        truth_status=truth_enum,
        may_be_used_as_evidence=data.may_be_used_as_evidence,
        llm_model=data.llm_model,
        llm_provider=data.llm_provider,
        prompt_hash=data.prompt_hash,
        input_summary=data.input_summary[:500] if data.input_summary else None,
        output_summary=data.output_summary[:500] if data.output_summary else None,
        extra_metadata=data.metadata or {},
    )
    db.add(record)
    db.commit()
    # Reload so database-generated attributes (e.g. created_at) are populated.
    db.refresh(record)

    return LLMAuditResponse(
        id=record.id,
        tenant_id=record.tenant_id,
        entity_type=record.entity_type,
        entity_id=record.entity_id,
        generation_mode=record.generation_mode,
        truth_status=record.truth_status.value if record.truth_status else "generated",
        may_be_used_as_evidence=record.may_be_used_as_evidence,
        llm_model=record.llm_model,
        llm_provider=record.llm_provider,
        prompt_hash=record.prompt_hash,
        input_summary=record.input_summary,
        output_summary=record.output_summary,
        metadata=record.extra_metadata,
        created_at=record.created_at,
    )
@router.get("/llm-audit")
async def list_llm_audit(
    entity_type: Optional[str] = Query(None),
    entity_id: Optional[str] = Query(None),
    page: int = Query(1, ge=1),
    limit: int = Query(50, ge=1, le=200),
    db: Session = Depends(get_db),
):
    """
    Return one page of LLM generation audit records, newest first.

    entity_type and entity_id narrow the result when given. The response
    carries the page of records plus the total number of matching rows and
    the paging parameters that were applied.
    """
    def _to_response(row):
        # Map one stored row onto the response schema.
        truth = row.truth_status
        return LLMAuditResponse(
            id=row.id,
            tenant_id=row.tenant_id,
            entity_type=row.entity_type,
            entity_id=row.entity_id,
            generation_mode=row.generation_mode,
            truth_status=truth.value if truth else "generated",
            may_be_used_as_evidence=row.may_be_used_as_evidence,
            llm_model=row.llm_model,
            llm_provider=row.llm_provider,
            prompt_hash=row.prompt_hash,
            input_summary=row.input_summary,
            output_summary=row.output_summary,
            metadata=row.extra_metadata,
            created_at=row.created_at,
        )

    query = db.query(LLMGenerationAuditDB)

    # Apply only the filters the caller actually supplied.
    criteria = (
        (LLMGenerationAuditDB.entity_type, entity_type),
        (LLMGenerationAuditDB.entity_id, entity_id),
    )
    for column, wanted in criteria:
        if wanted:
            query = query.filter(column == wanted)

    # Count before paging so "total" reflects every matching row.
    total = query.count()
    skip = (page - 1) * limit
    newest_first = query.order_by(LLMGenerationAuditDB.created_at.desc())
    rows = newest_first.offset(skip).limit(limit).all()

    return {
        "records": [_to_response(row) for row in rows],
        "total": total,
        "page": page,
        "limit": limit,
    }
@@ -56,6 +56,7 @@ class LoeschfristCreate(BaseModel):
responsible_person: Optional[str] = None
release_process: Optional[str] = None
linked_vvt_activity_ids: Optional[List[Any]] = None
linked_vendor_ids: Optional[List[Any]] = None
status: str = "DRAFT"
last_review_date: Optional[datetime] = None
next_review_date: Optional[datetime] = None
@@ -86,6 +87,7 @@ class LoeschfristUpdate(BaseModel):
responsible_person: Optional[str] = None
release_process: Optional[str] = None
linked_vvt_activity_ids: Optional[List[Any]] = None
linked_vendor_ids: Optional[List[Any]] = None
status: Optional[str] = None
last_review_date: Optional[datetime] = None
next_review_date: Optional[datetime] = None
@@ -100,7 +102,7 @@ class StatusUpdate(BaseModel):
# JSONB fields that need CAST
JSONB_FIELDS = {
"affected_groups", "data_categories", "legal_holds",
"storage_locations", "linked_vvt_activity_ids", "tags"
"storage_locations", "linked_vvt_activity_ids", "linked_vendor_ids", "tags"
}
@@ -42,6 +42,7 @@ class ObligationCreate(BaseModel):
priority: str = "medium"
responsible: Optional[str] = None
linked_systems: Optional[List[str]] = None
linked_vendor_ids: Optional[List[str]] = None
assessment_id: Optional[str] = None
rule_code: Optional[str] = None
notes: Optional[str] = None
@@ -57,6 +58,7 @@ class ObligationUpdate(BaseModel):
priority: Optional[str] = None
responsible: Optional[str] = None
linked_systems: Optional[List[str]] = None
linked_vendor_ids: Optional[List[str]] = None
notes: Optional[str] = None
@@ -173,14 +175,15 @@ async def create_obligation(
import json
linked_systems = json.dumps(payload.linked_systems or [])
linked_vendor_ids = json.dumps(payload.linked_vendor_ids or [])
row = db.execute(text("""
INSERT INTO compliance_obligations
(tenant_id, title, description, source, source_article, deadline,
status, priority, responsible, linked_systems, assessment_id, rule_code, notes)
status, priority, responsible, linked_systems, linked_vendor_ids, assessment_id, rule_code, notes)
VALUES
(:tenant_id, :title, :description, :source, :source_article, :deadline,
:status, :priority, :responsible, CAST(:linked_systems AS jsonb), :assessment_id, :rule_code, :notes)
:status, :priority, :responsible, CAST(:linked_systems AS jsonb), CAST(:linked_vendor_ids AS jsonb), :assessment_id, :rule_code, :notes)
RETURNING *
"""), {
"tenant_id": tenant_id,
@@ -193,6 +196,7 @@ async def create_obligation(
"priority": payload.priority,
"responsible": payload.responsible,
"linked_systems": linked_systems,
"linked_vendor_ids": linked_vendor_ids,
"assessment_id": payload.assessment_id,
"rule_code": payload.rule_code,
"notes": payload.notes,
@@ -235,6 +239,9 @@ async def update_obligation(
if field == "linked_systems":
updates["linked_systems"] = json.dumps(value or [])
set_clauses.append("linked_systems = CAST(:linked_systems AS jsonb)")
elif field == "linked_vendor_ids":
updates["linked_vendor_ids"] = json.dumps(value or [])
set_clauses.append("linked_vendor_ids = CAST(:linked_vendor_ids AS jsonb)")
else:
updates[field] = value
set_clauses.append(f"{field} = :{field}")
File diff suppressed because it is too large Load Diff
+148 -6
View File
@@ -25,6 +25,7 @@ from sqlalchemy.orm import Session
from classroom_engine.database import get_db
from .audit_trail_utils import log_audit_trail
from ..db import (
ControlDomainEnum,
ControlRepository,
@@ -312,8 +313,39 @@ async def get_control(
svc: ControlExportService = Depends(get_ctrl_export_service),
) -> ControlResponse:
"""Get a specific control by control_id."""
with translate_domain_errors():
return svc.get_control(control_id)
repo = ControlRepository(db)
control = repo.get_by_control_id(control_id)
if not control:
raise HTTPException(status_code=404, detail=f"Control {control_id} not found")
evidence_repo = EvidenceRepository(db)
evidence = evidence_repo.get_by_control(control.id)
return ControlResponse(
id=control.id,
control_id=control.control_id,
domain=control.domain.value if control.domain else None,
control_type=control.control_type.value if control.control_type else None,
title=control.title,
description=control.description,
pass_criteria=control.pass_criteria,
implementation_guidance=control.implementation_guidance,
code_reference=control.code_reference,
documentation_url=control.documentation_url,
is_automated=control.is_automated,
automation_tool=control.automation_tool,
automation_config=control.automation_config,
owner=control.owner,
review_frequency_days=control.review_frequency_days,
status=control.status.value if control.status else None,
status_notes=control.status_notes,
status_justification=control.status_justification,
last_reviewed_at=control.last_reviewed_at,
next_review_at=control.next_review_at,
created_at=control.created_at,
updated_at=control.updated_at,
evidence_count=len(evidence),
)
@router.put(
@@ -325,8 +357,83 @@ async def update_control(
svc: ControlExportService = Depends(get_ctrl_export_service),
) -> ControlResponse:
"""Update a control."""
with translate_domain_errors():
return svc.update_control(control_id, update)
repo = ControlRepository(db)
control = repo.get_by_control_id(control_id)
if not control:
raise HTTPException(status_code=404, detail=f"Control {control_id} not found")
update_data = update.model_dump(exclude_unset=True)
# Convert status string to enum and validate transition
if "status" in update_data:
try:
new_status_enum = ControlStatusEnum(update_data["status"])
except ValueError:
raise HTTPException(status_code=400, detail=f"Invalid status: {update_data['status']}")
# Validate status transition (Anti-Fake-Evidence)
from ..services.control_status_machine import validate_transition
current_status = control.status.value if control.status else "planned"
evidence_list = db.query(EvidenceDB).filter(EvidenceDB.control_id == control.id).all()
allowed, violations = validate_transition(
current_status=current_status,
new_status=update_data["status"],
evidence_list=evidence_list,
status_justification=update_data.get("status_justification") or update_data.get("status_notes"),
)
if not allowed:
raise HTTPException(
status_code=409,
detail={
"error": "Status transition not allowed",
"current_status": current_status,
"requested_status": update_data["status"],
"violations": violations,
}
)
update_data["status"] = new_status_enum
updated = repo.update(control.id, **update_data)
db.commit()
# Audit trail for status changes
new_status = updated.status.value if updated.status else None
if "status" in update.model_dump(exclude_unset=True) and current_status != new_status:
log_audit_trail(
db, "control", control.id, updated.control_id or updated.title,
"status_change",
performed_by=update.owner or "system",
field_changed="status",
old_value=current_status,
new_value=new_status,
)
db.commit()
return ControlResponse(
id=updated.id,
control_id=updated.control_id,
domain=updated.domain.value if updated.domain else None,
control_type=updated.control_type.value if updated.control_type else None,
title=updated.title,
description=updated.description,
pass_criteria=updated.pass_criteria,
implementation_guidance=updated.implementation_guidance,
code_reference=updated.code_reference,
documentation_url=updated.documentation_url,
is_automated=updated.is_automated,
automation_tool=updated.automation_tool,
automation_config=updated.automation_config,
owner=updated.owner,
review_frequency_days=updated.review_frequency_days,
status=updated.status.value if updated.status else None,
status_notes=updated.status_notes,
status_justification=updated.status_justification,
last_reviewed_at=updated.last_reviewed_at,
next_review_at=updated.next_review_at,
created_at=updated.created_at,
updated_at=updated.updated_at,
)
@router.put(
@@ -339,8 +446,43 @@ async def review_control(
svc: ControlExportService = Depends(get_ctrl_export_service),
) -> ControlResponse:
"""Mark a control as reviewed with new status."""
with translate_domain_errors():
return svc.review_control(control_id, review)
repo = ControlRepository(db)
control = repo.get_by_control_id(control_id)
if not control:
raise HTTPException(status_code=404, detail=f"Control {control_id} not found")
try:
status_enum = ControlStatusEnum(review.status)
except ValueError:
raise HTTPException(status_code=400, detail=f"Invalid status: {review.status}")
updated = repo.mark_reviewed(control.id, status_enum, review.status_notes)
db.commit()
return ControlResponse(
id=updated.id,
control_id=updated.control_id,
domain=updated.domain.value if updated.domain else None,
control_type=updated.control_type.value if updated.control_type else None,
title=updated.title,
description=updated.description,
pass_criteria=updated.pass_criteria,
implementation_guidance=updated.implementation_guidance,
code_reference=updated.code_reference,
documentation_url=updated.documentation_url,
is_automated=updated.is_automated,
automation_tool=updated.automation_tool,
automation_config=updated.automation_config,
owner=updated.owner,
review_frequency_days=updated.review_frequency_days,
status=updated.status.value if updated.status else None,
status_notes=updated.status_notes,
status_justification=updated.status_justification,
last_reviewed_at=updated.last_reviewed_at,
next_review_at=updated.next_review_at,
created_at=updated.created_at,
updated_at=updated.updated_at,
)
@router.get(
File diff suppressed because it is too large Load Diff
@@ -22,7 +22,9 @@ import uuid
from datetime import datetime, timezone
from typing import Any
from fastapi import APIRouter, File, Form, HTTPException, UploadFile
import httpx
from fastapi import APIRouter, File, Form, UploadFile, HTTPException
from pydantic import BaseModel
from sqlalchemy import text
from database import SessionLocal # re-exported below for legacy test patches
@@ -96,15 +98,13 @@ async def scan_dependencies(
db = SessionLocal()
try:
db.execute(
text(
"INSERT INTO compliance_screenings "
"(id, tenant_id, status, sbom_format, sbom_version, "
"total_components, total_issues, critical_issues, high_issues, "
"medium_issues, low_issues, sbom_data, started_at, completed_at) "
"VALUES (:id, :tenant_id, 'completed', 'CycloneDX', '1.5', "
":total_components, :total_issues, :critical, :high, :medium, :low, "
":sbom_data::jsonb, :started_at, :completed_at)"
),
text("""INSERT INTO compliance_screenings
(id, tenant_id, status, sbom_format, sbom_version,
total_components, total_issues, critical_issues, high_issues, medium_issues, low_issues,
sbom_data, started_at, completed_at)
VALUES (:id, :tenant_id, 'completed', 'CycloneDX', '1.5',
:total_components, :total_issues, :critical, :high, :medium, :low,
:sbom_data::jsonb, :started_at, :completed_at)"""),
{
"id": screening_id,
"tenant_id": tenant_id,
@@ -121,13 +121,11 @@ async def scan_dependencies(
)
for issue in issues:
db.execute(
text(
"INSERT INTO compliance_security_issues "
"(id, screening_id, severity, title, description, cve, cvss, "
"affected_component, affected_version, fixed_in, remediation, status) "
"VALUES (:id, :screening_id, :severity, :title, :description, :cve, :cvss, "
":component, :version, :fixed_in, :remediation, :status)"
),
text("""INSERT INTO compliance_security_issues
(id, screening_id, severity, title, description, cve, cvss,
affected_component, affected_version, fixed_in, remediation, status)
VALUES (:id, :screening_id, :severity, :title, :description, :cve, :cvss,
:component, :version, :fixed_in, :remediation, :status)"""),
{
"id": issue["id"],
"screening_id": screening_id,
@@ -214,8 +212,77 @@ async def get_screening(screening_id: str) -> ScreeningResponse:
"""Get a screening result by ID."""
db = SessionLocal()
try:
with translate_domain_errors():
return ScreeningService(db).get_screening(screening_id)
result = db.execute(
text("""SELECT id, status, sbom_format, sbom_version,
total_components, total_issues, critical_issues, high_issues,
medium_issues, low_issues, sbom_data, started_at, completed_at
FROM compliance_screenings WHERE id = :id"""),
{"id": screening_id},
)
row = result.fetchone()
if not row:
raise HTTPException(status_code=404, detail="Screening not found")
# Fetch issues
issues_result = db.execute(
text("""SELECT id, severity, title, description, cve, cvss,
affected_component, affected_version, fixed_in, remediation, status
FROM compliance_security_issues WHERE screening_id = :id"""),
{"id": screening_id},
)
issues_rows = issues_result.fetchall()
issues = [
SecurityIssueResponse(
id=str(r[0]), severity=r[1], title=r[2], description=r[3],
cve=r[4], cvss=r[5], affected_component=r[6],
affected_version=r[7], fixed_in=r[8], remediation=r[9], status=r[10],
)
for r in issues_rows
]
# Reconstruct components from SBOM data
sbom_data = row[10] or {}
components = []
comp_vulns: dict[str, list[dict]] = {}
for issue in issues:
if issue.affected_component not in comp_vulns:
comp_vulns[issue.affected_component] = []
comp_vulns[issue.affected_component].append({
"id": issue.cve or issue.id,
"cve": issue.cve,
"severity": issue.severity,
"title": issue.title,
"cvss": issue.cvss,
"fixedIn": issue.fixed_in,
})
for sc in sbom_data.get("components", []):
components.append(SBOMComponentResponse(
name=sc["name"],
version=sc["version"],
type=sc.get("type", "library"),
purl=sc.get("purl", ""),
licenses=sc.get("licenses", []),
vulnerabilities=comp_vulns.get(sc["name"], []),
))
return ScreeningResponse(
id=str(row[0]),
status=row[1],
sbom_format=row[2] or "CycloneDX",
sbom_version=row[3] or "1.5",
total_components=row[4] or 0,
total_issues=row[5] or 0,
critical_issues=row[6] or 0,
high_issues=row[7] or 0,
medium_issues=row[8] or 0,
low_issues=row[9] or 0,
components=components,
issues=issues,
started_at=str(row[11]) if row[11] else None,
completed_at=str(row[12]) if row[12] else None,
)
finally:
db.close()
@@ -225,8 +292,33 @@ async def list_screenings(tenant_id: str = "default") -> ScreeningListResponse:
"""List all screenings for a tenant."""
db = SessionLocal()
try:
with translate_domain_errors():
return ScreeningService(db).list_screenings(tenant_id)
result = db.execute(
text("""SELECT id, status, total_components, total_issues,
critical_issues, high_issues, medium_issues, low_issues,
started_at, completed_at, created_at
FROM compliance_screenings
WHERE tenant_id = :tenant_id
ORDER BY created_at DESC"""),
{"tenant_id": tenant_id},
)
rows = result.fetchall()
screenings = [
{
"id": str(r[0]),
"status": r[1],
"total_components": r[2],
"total_issues": r[3],
"critical_issues": r[4],
"high_issues": r[5],
"medium_issues": r[6],
"low_issues": r[7],
"started_at": str(r[8]) if r[8] else None,
"completed_at": str(r[9]) if r[9] else None,
"created_at": str(r[10]),
}
for r in rows
]
return ScreeningListResponse(screenings=screenings, total=len(screenings))
finally:
db.close()
@@ -0,0 +1,537 @@
"""
TOM Canonical Control Mapping Routes.
Three-layer architecture:
TOM Measures (~88, audit-level) → Mapping Bridge → Canonical Controls (10,000+)
Endpoints:
POST /v1/tom-mappings/sync Sync canonical controls for company profile
GET /v1/tom-mappings List all mappings for tenant/project
GET /v1/tom-mappings/by-tom/{code} Mappings for a specific TOM control
GET /v1/tom-mappings/stats Coverage statistics
POST /v1/tom-mappings/manual Manually add a mapping
DELETE /v1/tom-mappings/{id} Remove a mapping
"""
from __future__ import annotations
import hashlib
import json
import logging
from typing import Any, Optional
from fastapi import APIRouter, HTTPException, Query, Header
from pydantic import BaseModel
from sqlalchemy import text
from database import SessionLocal
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/tom-mappings", tags=["tom-control-mappings"])
# =============================================================================
# TOM CATEGORY → CANONICAL CATEGORY MAPPING
# =============================================================================
# Maps 13 TOM control categories to canonical_control_categories
# Each TOM category maps to 1-3 canonical categories for broad coverage
# Lookup table used by the sync endpoint: each key is a TOM control category,
# each value lists the canonical control categories whose controls are pulled
# in for that TOM category. Iteration order of this dict is the order in which
# the sync endpoint processes the categories.
TOM_TO_CANONICAL_CATEGORIES: dict[str, list[str]] = {
    "ACCESS_CONTROL": ["authentication", "identity", "physical"],
    "ADMISSION_CONTROL": ["authentication", "identity", "system"],
    "ACCESS_AUTHORIZATION": ["authentication", "identity"],
    "TRANSFER_CONTROL": ["network", "data_protection", "encryption"],
    "INPUT_CONTROL": ["application", "data_protection"],
    "ORDER_CONTROL": ["supply_chain", "compliance"],
    "AVAILABILITY": ["continuity", "system"],
    "SEPARATION": ["network", "data_protection"],
    "ENCRYPTION": ["encryption"],
    "PSEUDONYMIZATION": ["data_protection", "encryption"],
    "RESILIENCE": ["continuity", "system"],
    "RECOVERY": ["continuity"],
    "REVIEW": ["compliance", "governance", "risk"],
}
# =============================================================================
# REQUEST / RESPONSE MODELS
# =============================================================================
class SyncRequest(BaseModel):
    """Request body for the sync endpoint (POST ``/sync`` on this router).

    All fields are optional; an empty body syncs without profile filters.
    """

    # Industry of the company profile; when set, the sync restricts canonical
    # controls by their ``applicable_industries`` column.
    industry: Optional[str] = None
    # Company size of the profile; when set, the sync restricts canonical
    # controls by their ``applicable_company_size`` column.
    company_size: Optional[str] = None
    # When True the sync runs even if the stored profile hash is unchanged.
    force: bool = False
class ManualMappingRequest(BaseModel):
    """Request body for manually attaching a canonical control to a TOM measure."""

    # Code of the TOM control the mapping is attached to.
    tom_control_code: str
    # TOM category the control belongs to.
    tom_category: str
    # Primary key of the canonical control; the endpoint casts it to a UUID.
    canonical_control_id: str
    # Human-readable code of the canonical control, stored as supplied.
    canonical_control_code: str
    # Optional override; the endpoint falls back to the control's own category.
    canonical_category: Optional[str] = None
    # Relevance weight stored with the mapping.
    relevance_score: float = 1.0
# =============================================================================
# HELPERS
# =============================================================================
def _get_tenant_id(x_tenant_id: Optional[str]) -> str:
    """Return the tenant ID taken from the ``X-Tenant-ID`` header value.

    Args:
        x_tenant_id: Raw header value, or ``None`` when the header is absent.

    Returns:
        The header value unchanged.

    Raises:
        HTTPException: 400 when the value is missing or empty.
    """
    if x_tenant_id:
        return x_tenant_id
    raise HTTPException(status_code=400, detail="X-Tenant-ID header required")
def _compute_profile_hash(industry: Optional[str], company_size: Optional[str]) -> str:
    """Return a short fingerprint of the company profile for change detection.

    The profile is serialized as JSON with sorted keys so the same inputs
    always yield the same digest.

    Args:
        industry: Industry of the profile, or ``None``.
        company_size: Company size of the profile, or ``None``.

    Returns:
        The first 16 hex characters of the SHA-256 digest of the serialized profile.
    """
    profile = {"industry": industry, "company_size": company_size}
    serialized = json.dumps(profile, sort_keys=True)
    digest = hashlib.sha256(serialized.encode()).hexdigest()
    return digest[:16]
def _mapping_row_to_dict(r) -> dict[str, Any]:
    """Convert a ``tom_control_mappings`` row into the API response dict.

    Args:
        r: A result row exposing the mapping columns as attributes.

    Returns:
        A JSON-serializable dict. IDs are stringified, ``created_at`` is
        rendered as ISO-8601 (or ``None``), and ``relevance_score`` falls
        back to 1.0 only when the stored value is NULL.
    """
    # Compare against None explicitly: a stored relevance of 0 is a valid
    # score and must not be reported as the 1.0 default.
    score = r.relevance_score
    return {
        "id": str(r.id),
        "tenant_id": str(r.tenant_id),
        "project_id": str(r.project_id) if r.project_id else None,
        "tom_control_code": r.tom_control_code,
        "tom_category": r.tom_category,
        "canonical_control_id": str(r.canonical_control_id),
        "canonical_control_code": r.canonical_control_code,
        "canonical_category": r.canonical_category,
        "mapping_type": r.mapping_type,
        "relevance_score": float(score) if score is not None else 1.0,
        "created_at": r.created_at.isoformat() if r.created_at else None,
    }
# =============================================================================
# SYNC ENDPOINT
# =============================================================================
@router.post("/sync")
async def sync_mappings(
    body: SyncRequest,
    x_tenant_id: Optional[str] = Header(None, alias="X-Tenant-ID"),
    project_id: Optional[str] = Query(None),
):
    """
    Sync canonical controls to TOM measures based on company profile.

    Algorithm:
    1. Compute profile hash → skip if unchanged (unless force=True)
    2. For each TOM category, find matching canonical controls by:
       - Category mapping (TOM category → canonical categories)
       - Industry filter (applicable_industries JSONB containment)
       - Company size filter (applicable_company_size JSONB containment)
       - Only approved + customer_visible controls
    3. Delete old auto-mappings, insert new ones
    4. Update sync state

    Returns a dict with ``status`` "unchanged" (nothing written) or "synced"
    (with mapping counters). Raises HTTPException 400 when the tenant header
    is missing. Manual mappings are never deleted by this endpoint.
    """
    tenant_id = _get_tenant_id(x_tenant_id)
    profile_hash = _compute_profile_hash(body.industry, body.company_size)
    with SessionLocal() as db:
        # Check if sync is needed (profile unchanged)
        if not body.force:
            existing = db.execute(
                text("""
SELECT profile_hash FROM tom_control_sync_state
WHERE tenant_id = :tid AND (project_id = :pid OR (project_id IS NULL AND :pid IS NULL))
"""),
                {"tid": tenant_id, "pid": project_id},
            ).fetchone()
            if existing and existing.profile_hash == profile_hash:
                # Early exit: nothing is deleted or inserted on this path.
                return {
                    "status": "unchanged",
                    "message": "Profile unchanged since last sync",
                    "profile_hash": profile_hash,
                }
        # Delete old auto-mappings for this tenant+project
        db.execute(
            text("""
DELETE FROM tom_control_mappings
WHERE tenant_id = :tid
AND (project_id = :pid OR (project_id IS NULL AND :pid IS NULL))
AND mapping_type = 'auto'
"""),
            {"tid": tenant_id, "pid": project_id},
        )
        total_mappings = 0
        canonical_ids_matched = set()
        tom_codes_covered = set()
        # For each TOM category, find matching canonical controls
        for tom_category, canonical_categories in TOM_TO_CANONICAL_CATEGORIES.items():
            # Build an OR-chain of equality tests on `category`, one bind
            # parameter (:cat_0, :cat_1, ...) per canonical category.
            cat_conditions = " OR ".join(
                f"category = :cat_{i}" for i in range(len(canonical_categories))
            )
            cat_params = {f"cat_{i}": c for i, c in enumerate(canonical_categories)}
            # Build industry filter (only when the profile names an industry)
            industry_filter = ""
            if body.industry:
                industry_filter = """
AND (
applicable_industries IS NULL
OR applicable_industries @> '"all"'::jsonb
OR applicable_industries @> (:industry)::jsonb
)
"""
                # Bound as a one-element JSON array for the @> containment test.
                cat_params["industry"] = json.dumps([body.industry])
            # Build company size filter (only when the profile names a size)
            size_filter = ""
            if body.company_size:
                size_filter = """
AND (
applicable_company_size IS NULL
OR applicable_company_size @> '"all"'::jsonb
OR applicable_company_size @> (:csize)::jsonb
)
"""
                cat_params["csize"] = json.dumps([body.company_size])
            # Only trusted fragments (placeholder names and the fixed filter
            # snippets above) are interpolated; all values travel as binds.
            query = f"""
SELECT id, control_id, category
FROM canonical_controls
WHERE ({cat_conditions})
AND release_state = 'approved'
AND customer_visible = true
{industry_filter}
{size_filter}
ORDER BY control_id
"""
            rows = db.execute(text(query), cat_params).fetchall()
            # One mapping is inserted per canonical control per TOM category.
            # The TOM category name is stored in BOTH tom_control_code and
            # tom_category (:tom_cat is bound twice), i.e. auto-mappings are
            # attached at category level rather than to individual TOM codes.
            for row in rows:
                db.execute(
                    text("""
INSERT INTO tom_control_mappings (
tenant_id, project_id, tom_control_code, tom_category,
canonical_control_id, canonical_control_code, canonical_category,
mapping_type, relevance_score
) VALUES (
:tid, :pid, :tom_cat, :tom_cat,
:cc_id, :cc_code, :cc_category,
'auto', 1.00
)
ON CONFLICT (tenant_id, project_id, tom_control_code, canonical_control_id)
DO NOTHING
"""),
                    {
                        "tid": tenant_id,
                        "pid": project_id,
                        "tom_cat": tom_category,
                        "cc_id": str(row.id),
                        "cc_code": row.control_id,
                        "cc_category": row.category,
                    },
                )
                # NOTE(review): counters are incremented for every attempted
                # insert, including rows skipped by ON CONFLICT DO NOTHING.
                total_mappings += 1
                canonical_ids_matched.add(str(row.id))
                tom_codes_covered.add(tom_category)
        # Upsert sync state
        # NOTE(review): presumably relies on a unique constraint over
        # (tenant_id, project_id); confirm how it treats NULL project_id.
        db.execute(
            text("""
INSERT INTO tom_control_sync_state (
tenant_id, project_id, profile_hash,
total_mappings, canonical_controls_matched, tom_controls_covered,
last_synced_at
) VALUES (
:tid, :pid, :hash,
:total, :matched, :covered,
NOW()
)
ON CONFLICT (tenant_id, project_id)
DO UPDATE SET
profile_hash = :hash,
total_mappings = :total,
canonical_controls_matched = :matched,
tom_controls_covered = :covered,
last_synced_at = NOW()
"""),
            {
                "tid": tenant_id,
                "pid": project_id,
                "hash": profile_hash,
                "total": total_mappings,
                "matched": len(canonical_ids_matched),
                "covered": len(tom_codes_covered),
            },
        )
        # Delete, inserts and state upsert are committed together.
        db.commit()
    return {
        "status": "synced",
        "profile_hash": profile_hash,
        "total_mappings": total_mappings,
        "canonical_controls_matched": len(canonical_ids_matched),
        "tom_categories_covered": len(tom_codes_covered),
    }
# =============================================================================
# LIST MAPPINGS
# =============================================================================
@router.get("")
async def list_mappings(
    x_tenant_id: Optional[str] = Header(None, alias="X-Tenant-ID"),
    project_id: Optional[str] = Query(None),
    tom_category: Optional[str] = Query(None),
    mapping_type: Optional[str] = Query(None),
    limit: int = Query(500, ge=1, le=5000),
    offset: int = Query(0, ge=0),
):
    """List TOM ↔ canonical control mappings for a tenant/project.

    Args:
        x_tenant_id: Tenant ID from the ``X-Tenant-ID`` header (required).
        project_id: Project scope; ``None`` selects rows with NULL project.
        tom_category: Optional filter on the TOM category.
        mapping_type: Optional filter on the mapping type.
        limit: Page size (1-5000).
        offset: Number of rows to skip.

    Returns:
        ``{"mappings": [...], "total": N}`` where ``total`` is the number of
        rows matching ALL active filters, independent of ``limit``/``offset``.

    Raises:
        HTTPException: 400 when the tenant header is missing.
    """
    tenant_id = _get_tenant_id(x_tenant_id)

    query = (
        "\n"
        "SELECT m.*, cc.title as canonical_title, cc.severity as canonical_severity\n"
        "FROM tom_control_mappings m\n"
        "LEFT JOIN canonical_controls cc ON cc.id = m.canonical_control_id\n"
        "WHERE m.tenant_id = :tid\n"
        "AND (m.project_id = :pid OR (m.project_id IS NULL AND :pid IS NULL))\n"
    )
    count_query = (
        "\n"
        "SELECT count(*) FROM tom_control_mappings\n"
        "WHERE tenant_id = :tid\n"
        "AND (project_id = :pid OR (project_id IS NULL AND :pid IS NULL))\n"
    )
    filter_params: dict[str, Any] = {"tid": tenant_id, "pid": project_id}

    # Every optional filter is applied to BOTH statements so that the
    # reported total always describes the same row set as the page.
    if tom_category:
        query += " AND m.tom_category = :tcat"
        count_query += " AND tom_category = :tcat"
        filter_params["tcat"] = tom_category
    if mapping_type:
        query += " AND m.mapping_type = :mtype"
        count_query += " AND mapping_type = :mtype"
        filter_params["mtype"] = mapping_type

    query += " ORDER BY m.tom_category, m.canonical_control_code"
    query += " LIMIT :lim OFFSET :off"
    page_params: dict[str, Any] = {**filter_params, "lim": limit, "off": offset}

    with SessionLocal() as db:
        rows = db.execute(text(query), page_params).fetchall()
        total = db.execute(text(count_query), filter_params).scalar()

    mappings = []
    for r in rows:
        d = _mapping_row_to_dict(r)
        d["canonical_title"] = getattr(r, "canonical_title", None)
        d["canonical_severity"] = getattr(r, "canonical_severity", None)
        mappings.append(d)
    return {"mappings": mappings, "total": total}
# =============================================================================
# MAPPINGS BY TOM CONTROL
# =============================================================================
@router.get("/by-tom/{tom_code}")
async def get_mappings_by_tom(
    tom_code: str,
    x_tenant_id: Optional[str] = Header(None, alias="X-Tenant-ID"),
    project_id: Optional[str] = Query(None),
):
    """Return the canonical controls mapped to one TOM control code or category.

    ``tom_code`` is matched against both the ``tom_control_code`` and the
    ``tom_category`` column. Each mapping is enriched with the title,
    severity and objective of the joined canonical control (``None`` when
    the join finds no control).

    Raises:
        HTTPException: 400 when the tenant header is missing.
    """
    tenant_id = _get_tenant_id(x_tenant_id)

    lookup = text(
        "\n"
        "SELECT m.*, cc.title as canonical_title, cc.severity as canonical_severity,\n"
        "cc.objective as canonical_objective\n"
        "FROM tom_control_mappings m\n"
        "LEFT JOIN canonical_controls cc ON cc.id = m.canonical_control_id\n"
        "WHERE m.tenant_id = :tid\n"
        "AND (m.project_id = :pid OR (m.project_id IS NULL AND :pid IS NULL))\n"
        "AND (m.tom_control_code = :code OR m.tom_category = :code)\n"
        "ORDER BY m.canonical_control_code\n"
    )
    binds = {"tid": tenant_id, "pid": project_id, "code": tom_code}
    with SessionLocal() as db:
        records = db.execute(lookup, binds).fetchall()

    joined_columns = ("canonical_title", "canonical_severity", "canonical_objective")

    def _enrich(record):
        # Base mapping fields plus the columns contributed by the LEFT JOIN.
        entry = _mapping_row_to_dict(record)
        for column in joined_columns:
            entry[column] = getattr(record, column, None)
        return entry

    mappings = [_enrich(record) for record in records]
    return {"tom_code": tom_code, "mappings": mappings, "total": len(mappings)}
# =============================================================================
# STATS
# =============================================================================
@router.get("/stats")
async def get_mapping_stats(
    x_tenant_id: Optional[str] = Header(None, alias="X-Tenant-ID"),
    project_id: Optional[str] = Query(None),
):
    """Report coverage statistics for TOM ↔ canonical control mappings.

    Returns a dict with three parts: the stored sync state (zeros/``None``
    when the tenant/project has never been synced), a per-TOM-category
    breakdown of mapping counts, and the number of approved,
    customer-visible canonical controls.

    Raises:
        HTTPException: 400 when the tenant header is missing.
    """
    tenant_id = _get_tenant_id(x_tenant_id)
    scope = {"tid": tenant_id, "pid": project_id}

    state_sql = text(
        "\n"
        "SELECT * FROM tom_control_sync_state\n"
        "WHERE tenant_id = :tid\n"
        "AND (project_id = :pid OR (project_id IS NULL AND :pid IS NULL))\n"
    )
    breakdown_sql = text(
        "\n"
        "SELECT tom_category,\n"
        "count(*) as total_mappings,\n"
        "count(DISTINCT canonical_control_id) as unique_controls,\n"
        "count(*) FILTER (WHERE mapping_type = 'auto') as auto_count,\n"
        "count(*) FILTER (WHERE mapping_type = 'manual') as manual_count\n"
        "FROM tom_control_mappings\n"
        "WHERE tenant_id = :tid\n"
        "AND (project_id = :pid OR (project_id IS NULL AND :pid IS NULL))\n"
        "GROUP BY tom_category\n"
        "ORDER BY tom_category\n"
    )
    available_sql = text(
        "\n"
        "SELECT count(*) FROM canonical_controls\n"
        "WHERE release_state = 'approved' AND customer_visible = true\n"
    )

    with SessionLocal() as db:
        sync_state = db.execute(state_sql, scope).fetchone()
        category_stats = db.execute(breakdown_sql, scope).fetchall()
        total_canonical = db.execute(available_sql).scalar()

    if sync_state:
        synced_at = sync_state.last_synced_at
        sync_summary = {
            "profile_hash": sync_state.profile_hash,
            "total_mappings": sync_state.total_mappings,
            "canonical_controls_matched": sync_state.canonical_controls_matched,
            "tom_controls_covered": sync_state.tom_controls_covered,
            "last_synced_at": synced_at.isoformat() if synced_at else None,
        }
    else:
        # Never synced: report empty counters instead of failing.
        sync_summary = {
            "profile_hash": None,
            "total_mappings": 0,
            "canonical_controls_matched": 0,
            "tom_controls_covered": 0,
            "last_synced_at": None,
        }

    breakdown = []
    for stat in category_stats:
        breakdown.append(
            {
                "tom_category": stat.tom_category,
                "total_mappings": stat.total_mappings,
                "unique_controls": stat.unique_controls,
                "auto_count": stat.auto_count,
                "manual_count": stat.manual_count,
            }
        )

    return {
        "sync_state": sync_summary,
        "category_breakdown": breakdown,
        "total_canonical_controls_available": total_canonical or 0,
    }
# =============================================================================
# MANUAL MAPPING
# =============================================================================
@router.post("/manual", status_code=201)
async def add_manual_mapping(
    body: ManualMappingRequest,
    x_tenant_id: Optional[str] = Header(None, alias="X-Tenant-ID"),
    project_id: Optional[str] = Query(None),
):
    """Manually add a canonical control to a TOM measure.

    Args:
        body: Mapping to create.
        x_tenant_id: Tenant ID from the ``X-Tenant-ID`` header (required).
        project_id: Optional project scope for the mapping.

    Returns:
        The created mapping as an API dict.

    Raises:
        HTTPException: 400 when the tenant header is missing, 404 when the
            canonical control ID is malformed or unknown, 409 when the
            mapping already exists.
    """
    # Local imports keep this block self-contained; both modules are already
    # dependencies of the file (stdlib / SQLAlchemy).
    import uuid

    from sqlalchemy.exc import IntegrityError

    tenant_id = _get_tenant_id(x_tenant_id)

    # A malformed ID can never match a control. Reject it here instead of
    # letting CAST(... AS uuid) fail inside the database with a 500.
    try:
        control_uuid = str(uuid.UUID(body.canonical_control_id))
    except ValueError:
        raise HTTPException(status_code=404, detail="Canonical control not found") from None

    with SessionLocal() as db:
        # Verify canonical control exists
        cc = db.execute(
            text("SELECT id, control_id, category FROM canonical_controls WHERE id = CAST(:cid AS uuid)"),
            {"cid": control_uuid},
        ).fetchone()
        if not cc:
            raise HTTPException(status_code=404, detail="Canonical control not found")

        insert_sql = text(
            "\n"
            "INSERT INTO tom_control_mappings (\n"
            "tenant_id, project_id, tom_control_code, tom_category,\n"
            "canonical_control_id, canonical_control_code, canonical_category,\n"
            "mapping_type, relevance_score\n"
            ") VALUES (\n"
            ":tid, :pid, :tom_code, :tom_cat,\n"
            "CAST(:cc_id AS uuid), :cc_code, :cc_category,\n"
            "'manual', :score\n"
            ")\n"
            "RETURNING *\n"
        )
        insert_params = {
            "tid": tenant_id,
            "pid": project_id,
            "tom_code": body.tom_control_code,
            "tom_cat": body.tom_category,
            "cc_id": control_uuid,
            "cc_code": body.canonical_control_code,
            # Fall back to the control's own category when none is supplied.
            "cc_category": body.canonical_category or cc.category,
            "score": body.relevance_score,
        }
        try:
            row = db.execute(insert_sql, insert_params).fetchone()
            db.commit()
        except IntegrityError as exc:
            # Leave the session clean before translating the error.
            db.rollback()
            message = str(exc).lower()
            if "unique" in message or "duplicate" in message:
                raise HTTPException(status_code=409, detail="Mapping already exists") from exc
            # Other constraint failures (e.g. NOT NULL, FK) are not
            # duplicates and keep propagating unchanged.
            raise
    return _mapping_row_to_dict(row)
# =============================================================================
# DELETE MAPPING
# =============================================================================
@router.delete("/{mapping_id}", status_code=204)
async def delete_mapping(
    mapping_id: str,
    x_tenant_id: Optional[str] = Header(None, alias="X-Tenant-ID"),
):
    """Remove a mapping (manual or auto) owned by the calling tenant.

    Args:
        mapping_id: UUID of the mapping to delete.
        x_tenant_id: Tenant ID from the ``X-Tenant-ID`` header (required).

    Raises:
        HTTPException: 400 when the tenant header is missing, 404 when the
            ID is malformed or no mapping with that ID belongs to the tenant.
    """
    # Local import keeps this block self-contained (stdlib only).
    import uuid

    tenant_id = _get_tenant_id(x_tenant_id)

    # A malformed ID cannot identify a row. Answer 404 directly instead of
    # letting CAST(... AS uuid) fail inside the database with a 500.
    try:
        mapping_uuid = str(uuid.UUID(mapping_id))
    except ValueError:
        raise HTTPException(status_code=404, detail="Mapping not found") from None

    with SessionLocal() as db:
        result = db.execute(
            text(
                "\n"
                "DELETE FROM tom_control_mappings\n"
                "WHERE id = CAST(:mid AS uuid) AND tenant_id = :tid\n"
            ),
            {"mid": mapping_uuid, "tid": tenant_id},
        )
        # The tenant predicate makes foreign mappings look non-existent.
        if result.rowcount == 0:
            raise HTTPException(status_code=404, detail="Mapping not found")
        db.commit()
    return None
@@ -0,0 +1,427 @@
"""
FastAPI routes for VVT Master Libraries + Process Templates.
Library endpoints (read-only, global):
GET /vvt/libraries Overview: all library types + counts
GET /vvt/libraries/data-subjects Data subjects (filter: typical_for)
GET /vvt/libraries/data-categories Hierarchical (filter: parent_id, is_art9, flat)
GET /vvt/libraries/recipients Recipients (filter: type)
GET /vvt/libraries/legal-bases Legal bases (filter: is_art9, type)
GET /vvt/libraries/retention-rules Retention rules
GET /vvt/libraries/transfer-mechanisms Transfer mechanisms
GET /vvt/libraries/purposes Purposes (filter: typical_for)
GET /vvt/libraries/toms TOMs (filter: category)
Template endpoints:
GET /vvt/templates List templates (filter: business_function, search)
GET /vvt/templates/{id} Single template with resolved labels
POST /vvt/templates/{id}/instantiate Create VVT activity from template
"""
import logging
import uuid
from datetime import datetime
from typing import Optional
from fastapi import APIRouter, Depends, HTTPException, Query, Request
from sqlalchemy.orm import Session
from classroom_engine.database import get_db
from ..db.vvt_library_models import (
VVTLibDataSubjectDB,
VVTLibDataCategoryDB,
VVTLibRecipientDB,
VVTLibLegalBasisDB,
VVTLibRetentionRuleDB,
VVTLibTransferMechanismDB,
VVTLibPurposeDB,
VVTLibTomDB,
VVTProcessTemplateDB,
)
from ..db.vvt_models import VVTActivityDB, VVTAuditLogDB
from .tenant_utils import get_tenant_id
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/vvt", tags=["compliance-vvt-libraries"])
# ============================================================================
# Helper: row → dict
# ============================================================================
def _row_to_dict(row, extra_fields=None):
    """Serialize a library row into a plain dict.

    ``id`` and ``label_de`` are always included. ``description_de`` is added
    only when present and non-empty, ``sort_order`` whenever the attribute
    exists (even if ``None``), and each name in ``extra_fields`` only when
    the row has it with a non-``None`` value.

    Args:
        row: Object exposing the library columns as attributes.
        extra_fields: Optional iterable of additional attribute names.

    Returns:
        The assembled dict.
    """
    absent = object()  # distinguishes "no such attribute" from a None value

    result = {"id": row.id, "label_de": row.label_de}

    description = getattr(row, "description_de", None)
    if description:
        result["description_de"] = description

    order = getattr(row, "sort_order", absent)
    if order is not absent:
        result["sort_order"] = order

    for name in extra_fields or ():
        value = getattr(row, name, None)
        if value is not None:
            result[name] = value
    return result
# ============================================================================
# Library Overview
# ============================================================================
@router.get("/libraries")
async def get_libraries_overview(db: Session = Depends(get_db)):
    """Return every library type together with its row count.

    Issues one COUNT query per library, in the fixed order listed below.
    """
    catalog = (
        ("data-subjects", VVTLibDataSubjectDB),
        ("data-categories", VVTLibDataCategoryDB),
        ("recipients", VVTLibRecipientDB),
        ("legal-bases", VVTLibLegalBasisDB),
        ("retention-rules", VVTLibRetentionRuleDB),
        ("transfer-mechanisms", VVTLibTransferMechanismDB),
        ("purposes", VVTLibPurposeDB),
        ("toms", VVTLibTomDB),
    )
    overview = [
        {"type": slug, "count": db.query(model).count()}
        for slug, model in catalog
    ]
    return {"libraries": overview}
# ============================================================================
# Data Subjects
# ============================================================================
@router.get("/libraries/data-subjects")
async def list_data_subjects(
    typical_for: Optional[str] = Query(None, description="Filter by business function"),
    db: Session = Depends(get_db),
):
    """List data subjects ordered by ``sort_order``.

    When ``typical_for`` is given, only entries whose ``typical_for`` list
    contains that value are returned; the filter runs in Python after the
    rows have been loaded.
    """
    subjects = (
        db.query(VVTLibDataSubjectDB)
        .order_by(VVTLibDataSubjectDB.sort_order)
        .all()
    )
    items = [_row_to_dict(subject, ["art9_relevant", "typical_for"]) for subject in subjects]
    if not typical_for:
        return items
    return [entry for entry in items if typical_for in (entry.get("typical_for") or [])]
# ============================================================================
# Data Categories (hierarchical)
# ============================================================================
@router.get("/libraries/data-categories")
async def list_data_categories(
    flat: Optional[bool] = Query(False, description="Return flat list instead of tree"),
    parent_id: Optional[str] = Query(None),
    is_art9: Optional[bool] = Query(None),
    db: Session = Depends(get_db),
):
    """List data categories, as a tree by default.

    Args:
        flat: Return a flat list instead of a tree.
        parent_id: Only categories with this parent (implies a flat list).
        is_art9: Only categories with this Art. 9 flag (implies a flat list).
        db: Database session.

    Returns:
        A flat list of category dicts when ``flat`` or any filter is set.
        Otherwise the root categories, each carrying a ``children`` list
        wherever it has sub-categories. Nesting works at any depth, so
        categories below the second level are no longer dropped.
    """
    query = db.query(VVTLibDataCategoryDB).order_by(VVTLibDataCategoryDB.sort_order)
    if parent_id is not None:
        query = query.filter(VVTLibDataCategoryDB.parent_id == parent_id)
    if is_art9 is not None:
        query = query.filter(VVTLibDataCategoryDB.is_art9 == is_art9)
    rows = query.all()

    extra = ["parent_id", "is_art9", "is_art10", "risk_weight", "default_retention_rule", "default_legal_basis"]
    items = [_row_to_dict(r, extra) for r in rows]
    if flat or parent_id is not None or is_art9 is not None:
        return items

    # Group by parent; roots have no "parent_id" key and land under None.
    by_parent: dict = {}
    for item in items:
        by_parent.setdefault(item.get("parent_id"), []).append(item)

    # Attach children to EVERY node, not only to roots, so the hierarchy is
    # preserved at any depth. Non-recursive: each node is visited once.
    for item in items:
        children = by_parent.get(item["id"])
        if children:
            item["children"] = children

    return by_parent.get(None, [])
# ============================================================================
# Recipients
# ============================================================================
@router.get("/libraries/recipients")
async def list_recipients(
    type: Optional[str] = Query(None, description="INTERNAL, PROCESSOR, CONTROLLER, AUTHORITY"),
    db: Session = Depends(get_db),
):
    """List recipients ordered by ``sort_order``, optionally filtered by type."""
    recipient_type = type  # alias; the parameter name is the public query key
    recipients = db.query(VVTLibRecipientDB).order_by(VVTLibRecipientDB.sort_order)
    if recipient_type:
        recipients = recipients.filter(VVTLibRecipientDB.type == recipient_type)
    exposed = ["type", "is_third_country", "country"]
    return [_row_to_dict(recipient, exposed) for recipient in recipients.all()]
# ============================================================================
# Legal Bases
# ============================================================================
@router.get("/libraries/legal-bases")
async def list_legal_bases(
    is_art9: Optional[bool] = Query(None),
    type: Optional[str] = Query(None),
    db: Session = Depends(get_db),
):
    """List legal bases ordered by ``sort_order``.

    ``is_art9`` filters on the Art. 9 flag (both ``True`` and ``False`` are
    honored); ``type`` filters on the legal-basis type when non-empty.
    """
    basis_type = type  # alias; the parameter name is the public query key
    bases = db.query(VVTLibLegalBasisDB).order_by(VVTLibLegalBasisDB.sort_order)
    if is_art9 is not None:
        bases = bases.filter(VVTLibLegalBasisDB.is_art9 == is_art9)
    if basis_type:
        bases = bases.filter(VVTLibLegalBasisDB.type == basis_type)
    exposed = ["article", "type", "is_art9", "typical_national_law"]
    return [_row_to_dict(basis, exposed) for basis in bases.all()]
# ============================================================================
# Retention Rules
# ============================================================================
@router.get("/libraries/retention-rules")
async def list_retention_rules(db: Session = Depends(get_db)):
    """List all retention rules ordered by ``sort_order``."""
    exposed = ["legal_basis", "duration", "duration_unit", "start_event", "deletion_procedure"]
    ordered = db.query(VVTLibRetentionRuleDB).order_by(VVTLibRetentionRuleDB.sort_order)
    return [_row_to_dict(rule, exposed) for rule in ordered.all()]
# ============================================================================
# Transfer Mechanisms
# ============================================================================
@router.get("/libraries/transfer-mechanisms")
async def list_transfer_mechanisms(db: Session = Depends(get_db)):
    """List all transfer mechanisms ordered by ``sort_order``."""
    exposed = ["article", "requires_tia"]
    ordered = db.query(VVTLibTransferMechanismDB).order_by(VVTLibTransferMechanismDB.sort_order)
    return [_row_to_dict(mechanism, exposed) for mechanism in ordered.all()]
# ============================================================================
# Purposes
# ============================================================================
@router.get("/libraries/purposes")
async def list_purposes(
    typical_for: Optional[str] = Query(None),
    db: Session = Depends(get_db),
):
    """List purposes ordered by ``sort_order``.

    When ``typical_for`` is given, only entries whose ``typical_for`` list
    contains that value are kept; the filter runs in Python after loading.
    """
    purposes = db.query(VVTLibPurposeDB).order_by(VVTLibPurposeDB.sort_order).all()
    items = [_row_to_dict(purpose, ["typical_legal_basis", "typical_for"]) for purpose in purposes]
    if not typical_for:
        return items
    return [entry for entry in items if typical_for in (entry.get("typical_for") or [])]
# ============================================================================
# TOMs
# ============================================================================
@router.get("/libraries/toms")
async def list_toms(
    category: Optional[str] = Query(None),
    db: Session = Depends(get_db),
):
    """List TOM library entries ordered by ``sort_order``, optionally by category."""
    toms = db.query(VVTLibTomDB).order_by(VVTLibTomDB.sort_order)
    if category:
        toms = toms.filter(VVTLibTomDB.category == category)
    exposed = ["category", "art32_reference"]
    return [_row_to_dict(tom, exposed) for tom in toms.all()]
# ============================================================================
# Process Templates
# ============================================================================
def _template_to_dict(t: VVTProcessTemplateDB) -> dict:
    """Serialize a process template row into its API dict.

    List-valued reference columns default to ``[]``, ``protection_level``
    to ``"MEDIUM"`` and ``dpia_required`` to ``False`` when the stored
    value is empty/NULL. Key order matches the response layout.
    """
    payload = {
        field: getattr(t, field)
        for field in ("id", "name", "description", "business_function")
    }

    # Reference lists into the master libraries; NULL becomes an empty list.
    reference_lists = (
        "purpose_refs",
        "legal_basis_refs",
        "data_subject_refs",
        "data_category_refs",
        "recipient_refs",
        "tom_refs",
        "transfer_mechanism_refs",
    )
    for field in reference_lists:
        payload[field] = getattr(t, field) or []

    payload["retention_rule_ref"] = t.retention_rule_ref
    payload["typical_systems"] = t.typical_systems or []
    payload["protection_level"] = t.protection_level or "MEDIUM"
    payload["dpia_required"] = t.dpia_required or False
    payload["risk_score"] = t.risk_score
    payload["tags"] = t.tags or []
    payload["is_system"] = t.is_system
    payload["sort_order"] = t.sort_order
    return payload
def _resolve_labels(template_dict: dict, db: Session) -> dict:
    """Add human-readable labels for the library IDs in a template dict.

    For every non-empty ``*_refs`` list a ``*_labels`` dict (ID → German
    label) is added; IDs missing from the library map to themselves. The
    single ``retention_rule_ref`` gets a ``retention_rule_label`` when the
    rule exists. The dict is modified in place and also returned.
    """
    lookups = (
        ("purpose_refs", "purpose_labels", VVTLibPurposeDB),
        ("legal_basis_refs", "legal_basis_labels", VVTLibLegalBasisDB),
        ("data_subject_refs", "data_subject_labels", VVTLibDataSubjectDB),
        ("data_category_refs", "data_category_labels", VVTLibDataCategoryDB),
        ("recipient_refs", "recipient_labels", VVTLibRecipientDB),
        ("tom_refs", "tom_labels", VVTLibTomDB),
        ("transfer_mechanism_refs", "transfer_mechanism_labels", VVTLibTransferMechanismDB),
    )
    for refs_field, labels_field, library in lookups:
        ref_ids = template_dict.get(refs_field) or []
        if not ref_ids:
            continue
        found = db.query(library).filter(library.id.in_(ref_ids)).all()
        labels_by_id = {entry.id: entry.label_de for entry in found}
        template_dict[labels_field] = {
            ref: labels_by_id.get(ref, ref) for ref in ref_ids
        }

    # Resolve single retention rule
    retention_ref = template_dict.get("retention_rule_ref")
    if retention_ref:
        rule = (
            db.query(VVTLibRetentionRuleDB)
            .filter(VVTLibRetentionRuleDB.id == retention_ref)
            .first()
        )
        if rule:
            template_dict["retention_rule_label"] = rule.label_de
    return template_dict
@router.get("/templates")
async def list_templates(
    business_function: Optional[str] = Query(None),
    search: Optional[str] = Query(None),
    db: Session = Depends(get_db),
):
    """List process templates ordered by ``sort_order``.

    ``business_function`` filters by exact match; ``search`` performs a
    case-insensitive substring match on name or description. No tenant
    filter is applied in this handler.
    """
    templates = db.query(VVTProcessTemplateDB).order_by(VVTProcessTemplateDB.sort_order)
    if business_function:
        templates = templates.filter(
            VVTProcessTemplateDB.business_function == business_function
        )
    if search:
        pattern = f"%{search}%"
        name_hit = VVTProcessTemplateDB.name.ilike(pattern)
        description_hit = VVTProcessTemplateDB.description.ilike(pattern)
        templates = templates.filter(name_hit | description_hit)
    return [_template_to_dict(template) for template in templates.all()]
@router.get("/templates/{template_id}")
async def get_template(
    template_id: str,
    db: Session = Depends(get_db),
):
    """Return one process template with its library IDs resolved to labels.

    Raises:
        HTTPException: 404 when no template has the given ID.
    """
    template = (
        db.query(VVTProcessTemplateDB)
        .filter(VVTProcessTemplateDB.id == template_id)
        .first()
    )
    if template is None or not template:
        raise HTTPException(status_code=404, detail=f"Template '{template_id}' not found")
    return _resolve_labels(_template_to_dict(template), db)
def _allocate_vvt_id(db: Session, tid: str) -> str:
    """Return a ``VVT-NNNN`` identifier the tenant does not use yet.

    Numbering starts at the tenant's activity count plus one and then skips
    forward past identifiers that are already taken. A bare ``count + 1``
    repeats an existing identifier as soon as the row count is lower than the
    highest number handed out (for example after an activity was removed).
    """
    sequence = db.query(VVTActivityDB).filter(VVTActivityDB.tenant_id == tid).count() + 1
    while True:
        candidate = f"VVT-{sequence:04d}"
        clash = (
            db.query(VVTActivityDB.id)
            .filter(VVTActivityDB.tenant_id == tid, VVTActivityDB.vvt_id == candidate)
            .first()
        )
        if clash is None:
            return candidate
        sequence += 1


@router.post("/templates/{template_id}/instantiate", status_code=201)
async def instantiate_template(
    template_id: str,
    http_request: Request,
    tid: str = Depends(get_tenant_id),
    db: Session = Depends(get_db),
):
    """Create a new VVT activity for the tenant from a process template.

    The template's library references are copied onto the activity and are
    additionally resolved to ``label_de`` texts for the free-text fields. The
    activity is stored with status ``DRAFT`` together with a ``CREATE`` audit
    log entry, both in one commit.

    Args:
        template_id: Primary key of the template to instantiate.
        http_request: Incoming request; its ``X-User-ID`` header (default
            ``"system"``) is recorded as creator and as audit author.
        tid: Tenant id supplied by the ``get_tenant_id`` dependency.
        db: Database session supplied by the ``get_db`` dependency.

    Returns:
        The stored activity converted by ``_activity_to_response``.

    Raises:
        HTTPException: 404 when no template has the given id.
    """
    t = db.query(VVTProcessTemplateDB).filter(VVTProcessTemplateDB.id == template_id).first()
    if not t:
        raise HTTPException(status_code=404, detail=f"Template '{template_id}' not found")

    # NOTE(review): two concurrent requests can still pick the same number;
    # closing that gap needs a unique constraint or a sequence -- confirm schema.
    vvt_id = _allocate_vvt_id(db, tid)
    user_id = http_request.headers.get("X-User-ID", "system")

    # Resolve library IDs to free-text labels for the backward-compatible fields.
    purpose_labels = _resolve_ids(db, VVTLibPurposeDB, t.purpose_refs or [])
    legal_labels = _resolve_ids(db, VVTLibLegalBasisDB, t.legal_basis_refs or [])
    subject_labels = _resolve_ids(db, VVTLibDataSubjectDB, t.data_subject_refs or [])
    category_labels = _resolve_ids(db, VVTLibDataCategoryDB, t.data_category_refs or [])
    recipient_labels = _resolve_ids(db, VVTLibRecipientDB, t.recipient_refs or [])

    # Resolve the single retention rule; stays empty when the rule is unknown.
    retention_period = {}
    if t.retention_rule_ref:
        rr = db.query(VVTLibRetentionRuleDB).filter(VVTLibRetentionRuleDB.id == t.retention_rule_ref).first()
        if rr:
            retention_period = {
                "description": rr.label_de,
                "legalBasis": rr.legal_basis or "",
                "deletionProcedure": rr.deletion_procedure or "",
                "duration": rr.duration,
                "durationUnit": rr.duration_unit,
            }

    # Group TOM labels by category; rows with any other category are ignored.
    structured_toms = {"accessControl": [], "confidentiality": [], "integrity": [], "availability": [], "separation": []}
    if t.tom_refs:
        tom_rows = db.query(VVTLibTomDB).filter(VVTLibTomDB.id.in_(t.tom_refs)).all()
        for tr in tom_rows:
            cat = tr.category
            if cat in structured_toms:
                structured_toms[cat].append(tr.label_de)

    act = VVTActivityDB(
        tenant_id=tid,
        vvt_id=vvt_id,
        name=t.name,
        description=t.description or "",
        purposes=purpose_labels,
        legal_bases=[{"type": lid, "description": lbl} for lid, lbl in zip(t.legal_basis_refs or [], legal_labels)],
        data_subject_categories=subject_labels,
        personal_data_categories=category_labels,
        recipient_categories=[{"type": "unknown", "name": lbl} for lbl in recipient_labels],
        retention_period=retention_period,
        business_function=t.business_function,
        systems=[{"systemId": s, "name": s} for s in (t.typical_systems or [])],
        protection_level=t.protection_level or "MEDIUM",
        dpia_required=t.dpia_required or False,
        structured_toms=structured_toms,
        status="DRAFT",
        created_by=user_id,
        # Library refs
        purpose_refs=t.purpose_refs,
        legal_basis_refs=t.legal_basis_refs,
        data_subject_refs=t.data_subject_refs,
        data_category_refs=t.data_category_refs,
        recipient_refs=t.recipient_refs,
        retention_rule_ref=t.retention_rule_ref,
        transfer_mechanism_refs=t.transfer_mechanism_refs,
        tom_refs=t.tom_refs,
        source_template_id=t.id,
        risk_score=t.risk_score,
    )
    db.add(act)
    # Flush so that act.id is available for the audit entry below.
    db.flush()

    # Audit log
    audit = VVTAuditLogDB(
        tenant_id=tid,
        action="CREATE",
        entity_type="activity",
        entity_id=act.id,
        changed_by=user_id,
        new_values={"vvt_id": vvt_id, "source_template_id": t.id, "name": t.name},
    )
    db.add(audit)
    db.commit()
    db.refresh(act)

    # Imported here rather than at module level, as in the original code.
    from .vvt_routes import _activity_to_response
    return _activity_to_response(act)
def _resolve_ids(db: Session, model, ids: list) -> list:
    """Translate library IDs into their ``label_de`` texts.

    Args:
        db: Database session used for the lookup.
        model: Library model class exposing ``id`` and ``label_de``.
        ids: IDs to resolve; a falsy value yields an empty list.

    Returns:
        One entry per input ID, in input order. An ID without a matching row
        is returned unchanged in place of a label.
    """
    if not ids:
        return []

    labels_by_id = {}
    for row in db.query(model).filter(model.id.in_(ids)).all():
        labels_by_id[row.id] = row.label_de

    return [labels_by_id.get(ref, ref) for ref in ids]
@@ -81,6 +81,54 @@ async def upsert_organization(
# Activities
# ============================================================================
def _activity_to_response(act: VVTActivityDB) -> VVTActivityResponse:
    """Build the API response model for a stored VVT activity.

    Collection-like fields that are empty or unset on the row are replaced by
    an empty list or dict, ``protection_level`` falls back to ``MEDIUM``,
    ``dpia_required`` to ``False`` and ``status`` to ``DRAFT``. ``id`` and a
    set ``dsfa_id`` are converted to strings; all remaining fields are passed
    through unchanged.
    """
    core_fields = {
        "id": str(act.id),
        "vvt_id": act.vvt_id,
        "name": act.name,
        "description": act.description,
        "purposes": act.purposes or [],
        "legal_bases": act.legal_bases or [],
        "data_subject_categories": act.data_subject_categories or [],
        "personal_data_categories": act.personal_data_categories or [],
        "recipient_categories": act.recipient_categories or [],
        "third_country_transfers": act.third_country_transfers or [],
        "retention_period": act.retention_period or {},
        "tom_description": act.tom_description,
        "business_function": act.business_function,
        "systems": act.systems or [],
        "deployment_model": act.deployment_model,
        "data_sources": act.data_sources or [],
        "data_flows": act.data_flows or [],
        "protection_level": act.protection_level or "MEDIUM",
        "dpia_required": act.dpia_required or False,
        "structured_toms": act.structured_toms or {},
        "status": act.status or "DRAFT",
        "responsible": act.responsible,
        "owner": act.owner,
        "last_reviewed_at": act.last_reviewed_at,
        "next_review_at": act.next_review_at,
        "created_by": act.created_by,
        "dsfa_id": str(act.dsfa_id) if act.dsfa_id else None,
    }

    # References into the VVT libraries, copied as stored.
    library_fields = {
        "purpose_refs": act.purpose_refs,
        "legal_basis_refs": act.legal_basis_refs,
        "data_subject_refs": act.data_subject_refs,
        "data_category_refs": act.data_category_refs,
        "recipient_refs": act.recipient_refs,
        "retention_rule_ref": act.retention_rule_ref,
        "transfer_mechanism_refs": act.transfer_mechanism_refs,
        "tom_refs": act.tom_refs,
    }

    # Origin, links to other records, and timestamps.
    tracking_fields = {
        "source_template_id": act.source_template_id,
        "risk_score": act.risk_score,
        "linked_loeschfristen_ids": act.linked_loeschfristen_ids,
        "linked_tom_measure_ids": act.linked_tom_measure_ids,
        "art30_completeness": act.art30_completeness,
        "created_at": act.created_at,
        "updated_at": act.updated_at,
    }

    return VVTActivityResponse(**core_fields, **library_fields, **tracking_fields)
@router.get("/activities", response_model=List[VVTActivityResponse])
async def list_activities(
status: Optional[str] = Query(None),
@@ -145,6 +193,107 @@ async def delete_activity(
return service.delete_activity(tid, activity_id)
# ============================================================================
# Art. 30 Completeness Check
# ============================================================================
@router.get("/activities/{activity_id}/completeness")
async def get_activity_completeness(
    activity_id: str,
    tid: str = Depends(get_tenant_id),
    db: Session = Depends(get_db),
):
    """Return the Art. 30 completeness report for one of the tenant's activities.

    Args:
        activity_id: Primary key of the activity to evaluate.
        tid: Tenant id supplied by the ``get_tenant_id`` dependency; the
            activity must belong to this tenant.
        db: Database session supplied by the ``get_db`` dependency.

    Returns:
        The dictionary produced by ``_calculate_completeness``.

    Raises:
        HTTPException: 404 when the tenant has no activity with that id.
    """
    criteria = (
        VVTActivityDB.id == activity_id,
        VVTActivityDB.tenant_id == tid,
    )
    activity = db.query(VVTActivityDB).filter(*criteria).first()
    if activity:
        return _calculate_completeness(activity)
    raise HTTPException(status_code=404, detail=f"Activity {activity_id} not found")
def _calculate_completeness(act: VVTActivityDB) -> dict:
    """Score a VVT activity against the required fields of GDPR Art. 30(1).

    Ten checks are evaluated. Most pass when either the free-text field or the
    matching library reference is filled. The third-country transfer check
    always passes because "no transfer" is a valid state.

    Args:
        act: The activity to evaluate.

    Returns:
        A dict with ``score`` (integer percentage of passed checks),
        ``missing`` (keys of failed checks, in check order), ``warnings``
        (consistency hints that do not affect the score), ``passed`` and
        ``total``.
    """
    retention = act.retention_period
    has_retention = bool(retention and retention.get('description')) or bool(act.retention_rule_ref)

    # Activities created from a template always carry a structured_toms dict
    # whose categories may all be empty lists; such a dict is truthy but does
    # not document any measure, so look at the category contents instead.
    structured = act.structured_toms
    if isinstance(structured, dict):
        has_structured_toms = any(structured.values())
    else:
        has_structured_toms = bool(structured)

    checks = [
        # 1. Name
        ("name", bool(act.name)),
        # 2. Processing purposes
        ("purposes", bool(act.purposes) or bool(act.purpose_refs)),
        # 3. Legal basis
        ("legal_bases", bool(act.legal_bases) or bool(act.legal_basis_refs)),
        # 4. Categories of data subjects
        ("data_subjects", bool(act.data_subject_categories) or bool(act.data_subject_refs)),
        # 5. Categories of personal data
        ("data_categories", bool(act.personal_data_categories) or bool(act.data_category_refs)),
        # 6. Recipients
        ("recipients", bool(act.recipient_categories) or bool(act.recipient_refs)),
        # 7. Third-country transfer: always passes, no transfer is a valid state
        ("third_country_transfers", True),
        # 8. Retention / deletion periods
        ("retention_period", has_retention),
        # 9. Technical and organisational measures
        ("tom_description", bool(act.tom_description) or bool(act.tom_refs) or has_structured_toms),
        # 10. Responsible person
        ("responsible", bool(act.responsible)),
    ]

    missing = [key for key, ok in checks if not ok]
    total_checks = len(checks)
    passed = total_checks - len(missing)

    # Warnings flag inconsistent combinations without lowering the score.
    warnings = []
    if act.dpia_required and not act.dsfa_id:
        warnings.append("dpia_required_but_no_dsfa_linked")
    if act.third_country_transfers and not act.transfer_mechanism_refs:
        warnings.append("third_country_transfer_without_mechanism")

    # Integer arithmetic avoids float truncation when computing the percentage.
    score = passed * 100 // total_checks
    return {"score": score, "missing": missing, "warnings": warnings, "passed": passed, "total": total_checks}
# ============================================================================
# Audit Log
# ============================================================================
@@ -0,0 +1,443 @@
{
"framework_id": "CSA_CCM",
"display_name": "Cloud Security Alliance CCM v4",
"license": {
"type": "restricted",
"rag_allowed": false,
"use_as_metadata": true,
"note": "Abstrahierte Struktur — keine Originaltexte uebernommen"
},
"domains": [
{
"domain_id": "AIS",
"title": "Application and Interface Security",
"aliases": ["ais", "application and interface security", "anwendungssicherheit", "schnittstellensicherheit"],
"keywords": ["application", "anwendung", "interface", "schnittstelle", "api", "web", "eingabevalidierung"],
"subcontrols": [
{
"subcontrol_id": "AIS-01",
"title": "Application Security Policy",
"statement": "Sicherheitsrichtlinien fuer Anwendungsentwicklung und Schnittstellenmanagement muessen definiert und angewendet werden.",
"keywords": ["policy", "richtlinie", "entwicklung"],
"action_hint": "document",
"object_hint": "Anwendungssicherheitsrichtlinie",
"object_class": "policy"
},
{
"subcontrol_id": "AIS-02",
"title": "Application Security Design",
"statement": "Sicherheitsanforderungen muessen in den Entwurf jeder Anwendung integriert werden.",
"keywords": ["design", "entwurf", "security by design"],
"action_hint": "implement",
"object_hint": "Sicherheitsanforderungen im Anwendungsentwurf",
"object_class": "process"
},
{
"subcontrol_id": "AIS-03",
"title": "Application Security Testing",
"statement": "Anwendungen muessen vor dem Deployment und regelmaessig auf Sicherheitsschwachstellen getestet werden.",
"keywords": ["testing", "test", "sast", "dast", "penetration"],
"action_hint": "test",
"object_hint": "Anwendungssicherheitstests",
"object_class": "process"
},
{
"subcontrol_id": "AIS-04",
"title": "Secure Development Practices",
"statement": "Sichere Entwicklungspraktiken (Code Review, Pair Programming, SAST) muessen fuer alle Entwicklungsprojekte gelten.",
"keywords": ["development", "entwicklung", "code review", "sast", "praktiken"],
"action_hint": "implement",
"object_hint": "Sichere Entwicklungspraktiken",
"object_class": "process"
},
{
"subcontrol_id": "AIS-05",
"title": "API Security",
"statement": "APIs muessen authentifiziert, autorisiert und gegen Missbrauch geschuetzt werden.",
"keywords": ["api", "schnittstelle", "authentifizierung", "rate limiting"],
"action_hint": "implement",
"object_hint": "API-Sicherheitskontrollen",
"object_class": "interface"
},
{
"subcontrol_id": "AIS-06",
"title": "Automated Application Security Testing",
"statement": "Automatisierte Sicherheitstests muessen in die CI/CD-Pipeline integriert werden.",
"keywords": ["automatisiert", "ci/cd", "pipeline", "sast", "dast"],
"action_hint": "configure",
"object_hint": "Automatisierte Sicherheitstests in CI/CD",
"object_class": "configuration"
}
]
},
{
"domain_id": "BCR",
"title": "Business Continuity and Resilience",
"aliases": ["bcr", "business continuity", "resilience", "geschaeftskontinuitaet", "resilienz"],
"keywords": ["continuity", "kontinuitaet", "resilience", "resilienz", "disaster", "recovery", "backup"],
"subcontrols": [
{
"subcontrol_id": "BCR-01",
"title": "Business Continuity Planning",
"statement": "Ein Geschaeftskontinuitaetsplan muss erstellt, dokumentiert und regelmaessig getestet werden.",
"keywords": ["plan", "kontinuitaet", "geschaeft"],
"action_hint": "document",
"object_hint": "Geschaeftskontinuitaetsplan",
"object_class": "policy"
},
{
"subcontrol_id": "BCR-02",
"title": "Risk Assessment for BCM",
"statement": "Risikobewertungen muessen fuer geschaeftskritische Prozesse durchgefuehrt werden.",
"keywords": ["risiko", "bewertung", "kritisch"],
"action_hint": "assess",
"object_hint": "BCM-Risikobewertung",
"object_class": "risk_artifact"
},
{
"subcontrol_id": "BCR-03",
"title": "Backup and Recovery",
"statement": "Datensicherungen muessen regelmaessig erstellt und Wiederherstellungstests durchgefuehrt werden.",
"keywords": ["backup", "sicherung", "wiederherstellung", "recovery"],
"action_hint": "maintain",
"object_hint": "Datensicherung und Wiederherstellung",
"object_class": "technical_control"
},
{
"subcontrol_id": "BCR-04",
"title": "Disaster Recovery Planning",
"statement": "Ein Disaster-Recovery-Plan muss dokumentiert und jaehrlich getestet werden.",
"keywords": ["disaster", "recovery", "katastrophe"],
"action_hint": "document",
"object_hint": "Disaster-Recovery-Plan",
"object_class": "policy"
}
]
},
{
"domain_id": "CCC",
"title": "Change Control and Configuration Management",
"aliases": ["ccc", "change control", "configuration management", "aenderungsmanagement", "konfigurationsmanagement"],
"keywords": ["change", "aenderung", "konfiguration", "configuration", "release", "deployment"],
"subcontrols": [
{
"subcontrol_id": "CCC-01",
"title": "Change Management Policy",
"statement": "Ein Aenderungsmanagement-Prozess muss definiert und fuer alle Aenderungen angewendet werden.",
"keywords": ["policy", "richtlinie", "aenderung"],
"action_hint": "document",
"object_hint": "Aenderungsmanagement-Richtlinie",
"object_class": "policy"
},
{
"subcontrol_id": "CCC-02",
"title": "Change Testing",
"statement": "Aenderungen muessen vor der Produktivsetzung getestet und genehmigt werden.",
"keywords": ["test", "genehmigung", "approval"],
"action_hint": "test",
"object_hint": "Aenderungstests",
"object_class": "process"
},
{
"subcontrol_id": "CCC-03",
"title": "Configuration Baseline",
"statement": "Basiskonfigurationen fuer alle Systeme muessen definiert und dokumentiert werden.",
"keywords": ["baseline", "basis", "standard"],
"action_hint": "define",
"object_hint": "Konfigurationsbaseline",
"object_class": "configuration"
}
]
},
{
"domain_id": "CEK",
"title": "Cryptography, Encryption and Key Management",
"aliases": ["cek", "cryptography", "encryption", "key management", "kryptographie", "verschluesselung", "schluesselverwaltung"],
"keywords": ["kryptographie", "verschluesselung", "schluessel", "key", "encryption", "certificate", "zertifikat"],
"subcontrols": [
{
"subcontrol_id": "CEK-01",
"title": "Encryption Policy",
"statement": "Verschluesselungsrichtlinien muessen definiert werden, die Algorithmen, Schluessellaengen und Einsatzbereiche festlegen.",
"keywords": ["policy", "richtlinie", "algorithmus"],
"action_hint": "document",
"object_hint": "Verschluesselungsrichtlinie",
"object_class": "policy"
},
{
"subcontrol_id": "CEK-02",
"title": "Key Management",
"statement": "Kryptographische Schluessel muessen ueber ihren Lebenszyklus sicher verwaltet werden.",
"keywords": ["key", "schluessel", "management", "lebenszyklus"],
"action_hint": "maintain",
"object_hint": "Schluesselverwaltung",
"object_class": "cryptographic_control"
},
{
"subcontrol_id": "CEK-03",
"title": "Data Encryption",
"statement": "Sensible Daten muessen bei Speicherung und Uebertragung verschluesselt werden.",
"keywords": ["data", "daten", "speicherung", "uebertragung"],
"action_hint": "encrypt",
"object_hint": "Datenverschluesselung",
"object_class": "cryptographic_control"
}
]
},
{
"domain_id": "DSP",
"title": "Data Security and Privacy",
"aliases": ["dsp", "data security", "privacy", "datensicherheit", "datenschutz"],
"keywords": ["datenschutz", "datensicherheit", "privacy", "data security", "pii", "personenbezogen", "dsgvo"],
"subcontrols": [
{
"subcontrol_id": "DSP-01",
"title": "Data Classification",
"statement": "Daten muessen nach Sensibilitaet klassifiziert und entsprechend geschuetzt werden.",
"keywords": ["klassifizierung", "sensibilitaet", "classification"],
"action_hint": "define",
"object_hint": "Datenklassifizierung",
"object_class": "data"
},
{
"subcontrol_id": "DSP-02",
"title": "Data Inventory",
"statement": "Ein Dateninventar muss gefuehrt werden, das alle Verarbeitungen personenbezogener Daten dokumentiert.",
"keywords": ["inventar", "verzeichnis", "verarbeitung", "vvt"],
"action_hint": "maintain",
"object_hint": "Dateninventar",
"object_class": "register"
},
{
"subcontrol_id": "DSP-03",
"title": "Data Retention and Deletion",
"statement": "Aufbewahrungsfristen muessen definiert und Daten nach Ablauf sicher geloescht werden.",
"keywords": ["retention", "aufbewahrung", "loeschung", "frist"],
"action_hint": "delete",
"object_hint": "Datenloeschung nach Frist",
"object_class": "data"
},
{
"subcontrol_id": "DSP-04",
"title": "Privacy Impact Assessment",
"statement": "Datenschutz-Folgenabschaetzungen muessen fuer risikoreiche Verarbeitungen durchgefuehrt werden.",
"keywords": ["dsfa", "pia", "folgenabschaetzung", "impact"],
"action_hint": "assess",
"object_hint": "Datenschutz-Folgenabschaetzung",
"object_class": "risk_artifact"
},
{
"subcontrol_id": "DSP-05",
"title": "Data Subject Rights",
"statement": "Verfahren zur Bearbeitung von Betroffenenrechten muessen implementiert werden.",
"keywords": ["betroffenenrechte", "auskunft", "loeschung", "data subject"],
"action_hint": "implement",
"object_hint": "Betroffenenrechte-Verfahren",
"object_class": "process"
}
]
},
{
"domain_id": "GRC",
"title": "Governance, Risk and Compliance",
"aliases": ["grc", "governance", "risk", "compliance", "risikomanagement"],
"keywords": ["governance", "risiko", "compliance", "management", "policy", "richtlinie"],
"subcontrols": [
{
"subcontrol_id": "GRC-01",
"title": "Information Security Program",
"statement": "Ein umfassendes Informationssicherheitsprogramm muss etabliert und aufrechterhalten werden.",
"keywords": ["programm", "sicherheit", "information"],
"action_hint": "maintain",
"object_hint": "Informationssicherheitsprogramm",
"object_class": "policy"
},
{
"subcontrol_id": "GRC-02",
"title": "Risk Management Program",
"statement": "Ein Risikomanagement-Programm muss implementiert werden, das Identifikation, Bewertung und Behandlung umfasst.",
"keywords": ["risiko", "management", "bewertung", "behandlung"],
"action_hint": "implement",
"object_hint": "Risikomanagement-Programm",
"object_class": "process"
},
{
"subcontrol_id": "GRC-03",
"title": "Compliance Monitoring",
"statement": "Die Einhaltung regulatorischer und vertraglicher Anforderungen muss ueberwacht werden.",
"keywords": ["compliance", "einhaltung", "regulatorisch", "ueberwachung"],
"action_hint": "monitor",
"object_hint": "Compliance-Ueberwachung",
"object_class": "process"
}
]
},
{
"domain_id": "IAM",
"title": "Identity and Access Management",
"aliases": ["iam", "identity", "access management", "identitaetsmanagement", "zugriffsverwaltung"],
"keywords": ["identitaet", "zugriff", "identity", "access", "authentifizierung", "autorisierung", "sso"],
"subcontrols": [
{
"subcontrol_id": "IAM-01",
"title": "Identity and Access Policy",
"statement": "Identitaets- und Zugriffsmanagement-Richtlinien muessen definiert werden.",
"keywords": ["policy", "richtlinie"],
"action_hint": "document",
"object_hint": "IAM-Richtlinie",
"object_class": "policy"
},
{
"subcontrol_id": "IAM-02",
"title": "Strong Authentication",
"statement": "Starke Authentifizierung (MFA) muss fuer administrative und sicherheitskritische Zugriffe gefordert werden.",
"keywords": ["mfa", "stark", "authentifizierung", "admin"],
"action_hint": "implement",
"object_hint": "Starke Authentifizierung",
"object_class": "technical_control"
},
{
"subcontrol_id": "IAM-03",
"title": "Identity Lifecycle Management",
"statement": "Identitaeten muessen ueber ihren gesamten Lebenszyklus verwaltet werden.",
"keywords": ["lifecycle", "lebenszyklus", "onboarding", "offboarding"],
"action_hint": "maintain",
"object_hint": "Identitaets-Lebenszyklus",
"object_class": "account"
},
{
"subcontrol_id": "IAM-04",
"title": "Access Review",
"statement": "Zugriffsrechte muessen regelmaessig ueberprueft und ueberschuessige Rechte entzogen werden.",
"keywords": ["review", "ueberpruefen", "rechte", "rezertifizierung"],
"action_hint": "review",
"object_hint": "Zugriffsrechte-Review",
"object_class": "access_control"
}
]
},
{
"domain_id": "LOG",
"title": "Logging and Monitoring",
"aliases": ["log", "logging", "monitoring", "protokollierung", "ueberwachung"],
"keywords": ["logging", "monitoring", "protokollierung", "ueberwachung", "siem", "alarm"],
"subcontrols": [
{
"subcontrol_id": "LOG-01",
"title": "Logging Policy",
"statement": "Protokollierungs-Richtlinien muessen definiert werden, die Umfang und Aufbewahrung festlegen.",
"keywords": ["policy", "richtlinie", "umfang", "aufbewahrung"],
"action_hint": "document",
"object_hint": "Protokollierungsrichtlinie",
"object_class": "policy"
},
{
"subcontrol_id": "LOG-02",
"title": "Security Event Logging",
"statement": "Sicherheitsrelevante Ereignisse muessen erfasst und zentral gespeichert werden.",
"keywords": ["event", "ereignis", "sicherheit", "zentral"],
"action_hint": "configure",
"object_hint": "Sicherheits-Event-Logging",
"object_class": "configuration"
},
{
"subcontrol_id": "LOG-03",
"title": "Monitoring and Alerting",
"statement": "Sicherheitsrelevante Logs muessen ueberwacht und bei Anomalien Alarme ausgeloest werden.",
"keywords": ["monitoring", "alerting", "alarm", "anomalie"],
"action_hint": "monitor",
"object_hint": "Log-Ueberwachung und Alarmierung",
"object_class": "technical_control"
}
]
},
{
"domain_id": "SEF",
"title": "Security Incident Management",
"aliases": ["sef", "security incident", "incident management", "vorfallmanagement", "sicherheitsvorfall"],
"keywords": ["vorfall", "incident", "sicherheitsvorfall", "reaktion", "response", "meldung"],
"subcontrols": [
{
"subcontrol_id": "SEF-01",
"title": "Incident Management Policy",
"statement": "Ein Vorfallmanagement-Prozess muss definiert, dokumentiert und getestet werden.",
"keywords": ["policy", "richtlinie", "prozess"],
"action_hint": "document",
"object_hint": "Vorfallmanagement-Richtlinie",
"object_class": "policy"
},
{
"subcontrol_id": "SEF-02",
"title": "Incident Response Team",
"statement": "Ein Incident-Response-Team muss benannt und geschult werden.",
"keywords": ["team", "response", "schulung"],
"action_hint": "define",
"object_hint": "Incident-Response-Team",
"object_class": "role"
},
{
"subcontrol_id": "SEF-03",
"title": "Incident Reporting",
"statement": "Sicherheitsvorfaelle muessen innerhalb definierter Fristen an zustaendige Stellen gemeldet werden.",
"keywords": ["reporting", "meldung", "frist", "behoerde"],
"action_hint": "report",
"object_hint": "Vorfallmeldung",
"object_class": "incident"
},
{
"subcontrol_id": "SEF-04",
"title": "Incident Lessons Learned",
"statement": "Nach jedem Vorfall muss eine Nachbereitung mit Lessons Learned durchgefuehrt werden.",
"keywords": ["lessons learned", "nachbereitung", "verbesserung"],
"action_hint": "review",
"object_hint": "Vorfall-Nachbereitung",
"object_class": "record"
}
]
},
{
"domain_id": "TVM",
"title": "Threat and Vulnerability Management",
"aliases": ["tvm", "threat", "vulnerability", "schwachstelle", "bedrohung", "schwachstellenmanagement"],
"keywords": ["schwachstelle", "vulnerability", "threat", "bedrohung", "patch", "scan"],
"subcontrols": [
{
"subcontrol_id": "TVM-01",
"title": "Vulnerability Management Policy",
"statement": "Schwachstellenmanagement-Richtlinien muessen definiert und umgesetzt werden.",
"keywords": ["policy", "richtlinie"],
"action_hint": "document",
"object_hint": "Schwachstellenmanagement-Richtlinie",
"object_class": "policy"
},
{
"subcontrol_id": "TVM-02",
"title": "Vulnerability Scanning",
"statement": "Systeme muessen regelmaessig auf Schwachstellen gescannt werden.",
"keywords": ["scan", "scanning", "regelmaessig"],
"action_hint": "test",
"object_hint": "Schwachstellenscan",
"object_class": "system"
},
{
"subcontrol_id": "TVM-03",
"title": "Vulnerability Remediation",
"statement": "Erkannte Schwachstellen muessen priorisiert und innerhalb definierter Fristen behoben werden.",
"keywords": ["remediation", "behebung", "frist", "priorisierung"],
"action_hint": "remediate",
"object_hint": "Schwachstellenbehebung",
"object_class": "system"
},
{
"subcontrol_id": "TVM-04",
"title": "Penetration Testing",
"statement": "Regelmaessige Penetrationstests muessen durchgefuehrt werden.",
"keywords": ["penetration", "pentest", "test"],
"action_hint": "test",
"object_hint": "Penetrationstest",
"object_class": "system"
}
]
}
]
}
@@ -0,0 +1,514 @@
{
"framework_id": "NIST_SP800_53",
"display_name": "NIST SP 800-53 Rev. 5",
"license": {
"type": "public_domain",
"rag_allowed": true,
"use_as_metadata": true
},
"domains": [
{
"domain_id": "AC",
"title": "Access Control",
"aliases": ["access control", "zugriffskontrolle", "zugriffssteuerung"],
"keywords": ["access", "zugriff", "berechtigung", "authorization", "autorisierung"],
"subcontrols": [
{
"subcontrol_id": "AC-1",
"title": "Access Control Policy and Procedures",
"statement": "Zugriffskontrollrichtlinien und -verfahren muessen definiert, dokumentiert und regelmaessig ueberprueft werden.",
"keywords": ["policy", "richtlinie", "verfahren", "procedures"],
"action_hint": "document",
"object_hint": "Zugriffskontrollrichtlinie",
"object_class": "policy"
},
{
"subcontrol_id": "AC-2",
"title": "Account Management",
"statement": "Benutzerkonten muessen ueber ihren gesamten Lebenszyklus verwaltet werden: Erstellung, Aktivierung, Aenderung, Deaktivierung und Loeschung.",
"keywords": ["account", "konto", "benutzer", "lifecycle", "lebenszyklus"],
"action_hint": "maintain",
"object_hint": "Benutzerkontenverwaltung",
"object_class": "account"
},
{
"subcontrol_id": "AC-3",
"title": "Access Enforcement",
"statement": "Der Zugriff auf Systemressourcen muss gemaess der definierten Zugriffskontrollrichtlinie durchgesetzt werden.",
"keywords": ["enforcement", "durchsetzung", "ressourcen", "system"],
"action_hint": "restrict_access",
"object_hint": "Zugriffsdurchsetzung",
"object_class": "access_control"
},
{
"subcontrol_id": "AC-5",
"title": "Separation of Duties",
"statement": "Aufgabentrennung muss definiert und durchgesetzt werden, um Interessenkonflikte und Missbrauch zu verhindern.",
"keywords": ["separation", "trennung", "duties", "aufgaben", "funktionstrennung"],
"action_hint": "define",
"object_hint": "Aufgabentrennung",
"object_class": "role"
},
{
"subcontrol_id": "AC-6",
"title": "Least Privilege",
"statement": "Zugriffsrechte muessen nach dem Prinzip der minimalen Rechte vergeben werden.",
"keywords": ["least privilege", "minimal", "rechte", "privileg"],
"action_hint": "restrict_access",
"object_hint": "Minimale Rechtevergabe",
"object_class": "access_control"
},
{
"subcontrol_id": "AC-7",
"title": "Unsuccessful Logon Attempts",
"statement": "Fehlgeschlagene Anmeldeversuche muessen begrenzt und ueberwacht werden.",
"keywords": ["logon", "anmeldung", "fehlgeschlagen", "sperre", "lockout"],
"action_hint": "monitor",
"object_hint": "Anmeldeversuchsueberwachung",
"object_class": "technical_control"
},
{
"subcontrol_id": "AC-17",
"title": "Remote Access",
"statement": "Fernzugriff muss autorisiert, ueberwacht und verschluesselt werden.",
"keywords": ["remote", "fern", "vpn", "fernzugriff"],
"action_hint": "configure",
"object_hint": "Fernzugriffskonfiguration",
"object_class": "technical_control"
}
]
},
{
"domain_id": "AU",
"title": "Audit and Accountability",
"aliases": ["audit", "protokollierung", "accountability", "rechenschaftspflicht"],
"keywords": ["audit", "log", "protokoll", "nachvollziehbarkeit", "logging"],
"subcontrols": [
{
"subcontrol_id": "AU-1",
"title": "Audit Policy and Procedures",
"statement": "Audit- und Protokollierungsrichtlinien muessen definiert und regelmaessig ueberprueft werden.",
"keywords": ["policy", "richtlinie", "audit"],
"action_hint": "document",
"object_hint": "Auditrichtlinie",
"object_class": "policy"
},
{
"subcontrol_id": "AU-2",
"title": "Event Logging",
"statement": "Sicherheitsrelevante Ereignisse muessen identifiziert und protokolliert werden.",
"keywords": ["event", "ereignis", "logging", "protokollierung"],
"action_hint": "configure",
"object_hint": "Ereignisprotokollierung",
"object_class": "configuration"
},
{
"subcontrol_id": "AU-3",
"title": "Content of Audit Records",
"statement": "Audit-Eintraege muessen ausreichende Informationen enthalten: Zeitstempel, Quelle, Ergebnis, Identitaet.",
"keywords": ["content", "inhalt", "record", "eintrag"],
"action_hint": "define",
"object_hint": "Audit-Eintragsformat",
"object_class": "record"
},
{
"subcontrol_id": "AU-6",
"title": "Audit Record Review and Reporting",
"statement": "Audit-Eintraege muessen regelmaessig ueberprueft und bei Anomalien berichtet werden.",
"keywords": ["review", "ueberpruefen", "reporting", "anomalie"],
"action_hint": "review",
"object_hint": "Audit-Ueberpruefung",
"object_class": "record"
},
{
"subcontrol_id": "AU-9",
"title": "Protection of Audit Information",
"statement": "Audit-Daten muessen vor unbefugtem Zugriff, Aenderung und Loeschung geschuetzt werden.",
"keywords": ["schutz", "protection", "integritaet", "integrity"],
"action_hint": "implement",
"object_hint": "Audit-Datenschutz",
"object_class": "technical_control"
}
]
},
{
"domain_id": "AT",
"title": "Awareness and Training",
"aliases": ["awareness", "training", "schulung", "sensibilisierung"],
"keywords": ["training", "schulung", "awareness", "sensibilisierung", "weiterbildung"],
"subcontrols": [
{
"subcontrol_id": "AT-1",
"title": "Policy and Procedures",
"statement": "Schulungs- und Sensibilisierungsrichtlinien muessen definiert und regelmaessig aktualisiert werden.",
"keywords": ["policy", "richtlinie"],
"action_hint": "document",
"object_hint": "Schulungsrichtlinie",
"object_class": "policy"
},
{
"subcontrol_id": "AT-2",
"title": "Literacy Training and Awareness",
"statement": "Alle Mitarbeiter muessen regelmaessig Sicherheitsschulungen erhalten.",
"keywords": ["mitarbeiter", "schulung", "sicherheit"],
"action_hint": "train",
"object_hint": "Sicherheitsschulung",
"object_class": "training"
},
{
"subcontrol_id": "AT-3",
"title": "Role-Based Training",
"statement": "Rollenbasierte Sicherheitsschulungen muessen fuer Mitarbeiter mit besonderen Sicherheitsaufgaben durchgefuehrt werden.",
"keywords": ["rollenbasiert", "role-based", "speziell"],
"action_hint": "train",
"object_hint": "Rollenbasierte Sicherheitsschulung",
"object_class": "training"
}
]
},
{
"domain_id": "CM",
"title": "Configuration Management",
"aliases": ["configuration management", "konfigurationsmanagement", "konfiguration"],
"keywords": ["konfiguration", "configuration", "baseline", "haertung", "hardening"],
"subcontrols": [
{
"subcontrol_id": "CM-1",
"title": "Policy and Procedures",
"statement": "Konfigurationsmanagement-Richtlinien muessen dokumentiert und gepflegt werden.",
"keywords": ["policy", "richtlinie"],
"action_hint": "document",
"object_hint": "Konfigurationsmanagement-Richtlinie",
"object_class": "policy"
},
{
"subcontrol_id": "CM-2",
"title": "Baseline Configuration",
"statement": "Basiskonfigurationen fuer Systeme muessen definiert, dokumentiert und gepflegt werden.",
"keywords": ["baseline", "basis", "standard"],
"action_hint": "define",
"object_hint": "Basiskonfiguration",
"object_class": "configuration"
},
{
"subcontrol_id": "CM-6",
"title": "Configuration Settings",
"statement": "Sicherheitsrelevante Konfigurationseinstellungen muessen definiert und durchgesetzt werden.",
"keywords": ["settings", "einstellungen", "sicherheit"],
"action_hint": "configure",
"object_hint": "Sicherheitskonfiguration",
"object_class": "configuration"
},
{
"subcontrol_id": "CM-7",
"title": "Least Functionality",
"statement": "Systeme muessen so konfiguriert werden, dass nur notwendige Funktionen aktiv sind.",
"keywords": ["least functionality", "minimal", "dienste", "ports"],
"action_hint": "configure",
"object_hint": "Minimalkonfiguration",
"object_class": "configuration"
},
{
"subcontrol_id": "CM-8",
"title": "System Component Inventory",
"statement": "Ein Inventar aller Systemkomponenten muss gefuehrt und aktuell gehalten werden.",
"keywords": ["inventar", "inventory", "komponenten", "assets"],
"action_hint": "maintain",
"object_hint": "Systemkomponenten-Inventar",
"object_class": "register"
}
]
},
{
"domain_id": "IA",
"title": "Identification and Authentication",
"aliases": ["identification", "authentication", "identifikation", "authentifizierung"],
"keywords": ["authentifizierung", "identifikation", "identity", "passwort", "mfa", "credential"],
"subcontrols": [
{
"subcontrol_id": "IA-1",
"title": "Policy and Procedures",
"statement": "Identifikations- und Authentifizierungsrichtlinien muessen dokumentiert und regelmaessig ueberprueft werden.",
"keywords": ["policy", "richtlinie"],
"action_hint": "document",
"object_hint": "Authentifizierungsrichtlinie",
"object_class": "policy"
},
{
"subcontrol_id": "IA-2",
"title": "Identification and Authentication",
"statement": "Benutzer und Geraete muessen eindeutig identifiziert und authentifiziert werden.",
"keywords": ["benutzer", "geraete", "identifizierung"],
"action_hint": "implement",
"object_hint": "Benutzerauthentifizierung",
"object_class": "technical_control"
},
{
"subcontrol_id": "IA-2(1)",
"title": "Multi-Factor Authentication",
"statement": "Multi-Faktor-Authentifizierung muss fuer privilegierte Konten implementiert werden.",
"keywords": ["mfa", "multi-faktor", "zwei-faktor", "2fa"],
"action_hint": "implement",
"object_hint": "Multi-Faktor-Authentifizierung",
"object_class": "technical_control"
},
{
"subcontrol_id": "IA-5",
"title": "Authenticator Management",
"statement": "Authentifizierungsmittel (Passwoerter, Token, Zertifikate) muessen sicher verwaltet werden.",
"keywords": ["passwort", "token", "zertifikat", "credential"],
"action_hint": "maintain",
"object_hint": "Authentifizierungsmittel-Verwaltung",
"object_class": "technical_control"
}
]
},
{
"domain_id": "IR",
"title": "Incident Response",
"aliases": ["incident response", "vorfallbehandlung", "vorfallreaktion", "incident management"],
"keywords": ["vorfall", "incident", "reaktion", "response", "breach", "sicherheitsvorfall"],
"subcontrols": [
{
"subcontrol_id": "IR-1",
"title": "Policy and Procedures",
"statement": "Vorfallreaktionsrichtlinien und -verfahren muessen definiert und regelmaessig aktualisiert werden.",
"keywords": ["policy", "richtlinie", "verfahren"],
"action_hint": "document",
"object_hint": "Vorfallreaktionsrichtlinie",
"object_class": "policy"
},
{
"subcontrol_id": "IR-2",
"title": "Incident Response Training",
"statement": "Mitarbeiter muessen regelmaessig in der Vorfallreaktion geschult werden.",
"keywords": ["training", "schulung"],
"action_hint": "train",
"object_hint": "Vorfallreaktionsschulung",
"object_class": "training"
},
{
"subcontrol_id": "IR-4",
"title": "Incident Handling",
"statement": "Ein strukturierter Prozess fuer die Vorfallbehandlung muss implementiert werden: Erkennung, Analyse, Eindaemmung, Behebung.",
"keywords": ["handling", "behandlung", "erkennung", "eindaemmung"],
"action_hint": "implement",
"object_hint": "Vorfallbehandlungsprozess",
"object_class": "process"
},
{
"subcontrol_id": "IR-5",
"title": "Incident Monitoring",
"statement": "Sicherheitsvorfaelle muessen kontinuierlich ueberwacht und verfolgt werden.",
"keywords": ["monitoring", "ueberwachung", "tracking"],
"action_hint": "monitor",
"object_hint": "Vorfallsueberwachung",
"object_class": "incident"
},
{
"subcontrol_id": "IR-6",
"title": "Incident Reporting",
"statement": "Sicherheitsvorfaelle muessen innerhalb definierter Fristen an die zustaendigen Stellen gemeldet werden.",
"keywords": ["reporting", "meldung", "melden", "frist"],
"action_hint": "report",
"object_hint": "Vorfallmeldung",
"object_class": "incident"
},
{
"subcontrol_id": "IR-8",
"title": "Incident Response Plan",
"statement": "Ein Vorfallreaktionsplan muss dokumentiert und regelmaessig getestet werden.",
"keywords": ["plan", "dokumentation", "test"],
"action_hint": "document",
"object_hint": "Vorfallreaktionsplan",
"object_class": "policy"
}
]
},
{
"domain_id": "RA",
"title": "Risk Assessment",
"aliases": ["risk assessment", "risikobewertung", "risikoanalyse"],
"keywords": ["risiko", "risk", "bewertung", "assessment", "analyse", "bedrohung", "threat"],
"subcontrols": [
{
"subcontrol_id": "RA-1",
"title": "Policy and Procedures",
"statement": "Risikobewertungsrichtlinien muessen dokumentiert und regelmaessig aktualisiert werden.",
"keywords": ["policy", "richtlinie"],
"action_hint": "document",
"object_hint": "Risikobewertungsrichtlinie",
"object_class": "policy"
},
{
"subcontrol_id": "RA-3",
"title": "Risk Assessment",
"statement": "Regelmaessige Risikobewertungen muessen durchgefuehrt und dokumentiert werden.",
"keywords": ["bewertung", "assessment", "regelmaessig"],
"action_hint": "assess",
"object_hint": "Risikobewertung",
"object_class": "risk_artifact"
},
{
"subcontrol_id": "RA-5",
"title": "Vulnerability Monitoring and Scanning",
"statement": "Systeme muessen regelmaessig auf Schwachstellen gescannt und ueberwacht werden.",
"keywords": ["vulnerability", "schwachstelle", "scan", "monitoring"],
"action_hint": "monitor",
"object_hint": "Schwachstellenueberwachung",
"object_class": "system"
}
]
},
{
"domain_id": "SC",
"title": "System and Communications Protection",
"aliases": ["system protection", "communications protection", "kommunikationsschutz", "systemschutz"],
"keywords": ["verschluesselung", "encryption", "tls", "netzwerk", "network", "kommunikation", "firewall"],
"subcontrols": [
{
"subcontrol_id": "SC-1",
"title": "Policy and Procedures",
"statement": "System- und Kommunikationsschutzrichtlinien muessen dokumentiert und aktuell gehalten werden.",
"keywords": ["policy", "richtlinie"],
"action_hint": "document",
"object_hint": "Kommunikationsschutzrichtlinie",
"object_class": "policy"
},
{
"subcontrol_id": "SC-7",
"title": "Boundary Protection",
"statement": "Netzwerkgrenzen muessen durch Firewall-Regeln und Zugangskontrollen geschuetzt werden.",
"keywords": ["boundary", "grenze", "firewall", "netzwerk"],
"action_hint": "implement",
"object_hint": "Netzwerkgrenzschutz",
"object_class": "technical_control"
},
{
"subcontrol_id": "SC-8",
"title": "Transmission Confidentiality and Integrity",
"statement": "Daten muessen bei der Uebertragung durch Verschluesselung geschuetzt werden.",
"keywords": ["transmission", "uebertragung", "verschluesselung", "tls"],
"action_hint": "encrypt",
"object_hint": "Uebertragungsverschluesselung",
"object_class": "cryptographic_control"
},
{
"subcontrol_id": "SC-12",
"title": "Cryptographic Key Establishment and Management",
"statement": "Kryptographische Schluessel muessen sicher erzeugt, verteilt, gespeichert und widerrufen werden.",
"keywords": ["key", "schluessel", "kryptographie", "management"],
"action_hint": "maintain",
"object_hint": "Schluesselverwaltung",
"object_class": "cryptographic_control"
},
{
"subcontrol_id": "SC-13",
"title": "Cryptographic Protection",
"statement": "Kryptographische Mechanismen muessen gemaess anerkannten Standards implementiert werden.",
"keywords": ["kryptographie", "verschluesselung", "standard"],
"action_hint": "implement",
"object_hint": "Kryptographischer Schutz",
"object_class": "cryptographic_control"
}
]
},
{
"domain_id": "SI",
"title": "System and Information Integrity",
"aliases": ["system integrity", "information integrity", "systemintegritaet", "informationsintegritaet"],
"keywords": ["integritaet", "integrity", "malware", "patch", "flaw", "schwachstelle"],
"subcontrols": [
{
"subcontrol_id": "SI-1",
"title": "Policy and Procedures",
"statement": "System- und Informationsintegritaetsrichtlinien muessen dokumentiert und regelmaessig ueberprueft werden.",
"keywords": ["policy", "richtlinie"],
"action_hint": "document",
"object_hint": "Integritaetsrichtlinie",
"object_class": "policy"
},
{
"subcontrol_id": "SI-2",
"title": "Flaw Remediation",
"statement": "Bekannte Schwachstellen muessen innerhalb definierter Fristen behoben werden.",
"keywords": ["flaw", "schwachstelle", "patch", "behebung", "remediation"],
"action_hint": "remediate",
"object_hint": "Schwachstellenbehebung",
"object_class": "system"
},
{
"subcontrol_id": "SI-3",
"title": "Malicious Code Protection",
"statement": "Systeme muessen vor Schadsoftware geschuetzt werden durch Erkennung und Abwehrmechanismen.",
"keywords": ["malware", "schadsoftware", "antivirus", "erkennung"],
"action_hint": "implement",
"object_hint": "Schadsoftwareschutz",
"object_class": "technical_control"
},
{
"subcontrol_id": "SI-4",
"title": "System Monitoring",
"statement": "Systeme muessen kontinuierlich auf Sicherheitsereignisse und Anomalien ueberwacht werden.",
"keywords": ["monitoring", "ueberwachung", "anomalie", "siem"],
"action_hint": "monitor",
"object_hint": "Systemueberwachung",
"object_class": "system"
},
{
"subcontrol_id": "SI-5",
"title": "Security Alerts and Advisories",
"statement": "Sicherheitswarnungen muessen empfangen, bewertet und darauf reagiert werden.",
"keywords": ["alert", "warnung", "advisory", "cve"],
"action_hint": "monitor",
"object_hint": "Sicherheitswarnungen",
"object_class": "incident"
}
]
},
{
"domain_id": "SA",
"title": "System and Services Acquisition",
"aliases": ["system acquisition", "services acquisition", "systembeschaffung", "secure development"],
"keywords": ["beschaffung", "acquisition", "entwicklung", "development", "lieferkette", "supply chain"],
"subcontrols": [
{
"subcontrol_id": "SA-1",
"title": "Policy and Procedures",
"statement": "Beschaffungsrichtlinien mit Sicherheitsanforderungen muessen dokumentiert werden.",
"keywords": ["policy", "richtlinie", "beschaffung"],
"action_hint": "document",
"object_hint": "Beschaffungsrichtlinie",
"object_class": "policy"
},
{
"subcontrol_id": "SA-8",
"title": "Security and Privacy Engineering Principles",
"statement": "Sicherheits- und Datenschutzprinzipien muessen in die Systementwicklung integriert werden.",
"keywords": ["engineering", "development", "prinzipien", "design"],
"action_hint": "implement",
"object_hint": "Security-by-Design-Prinzipien",
"object_class": "process"
},
{
"subcontrol_id": "SA-11",
"title": "Developer Testing and Evaluation",
"statement": "Entwickler muessen Sicherheitstests und Code-Reviews durchfuehren.",
"keywords": ["testing", "test", "code review", "evaluation"],
"action_hint": "test",
"object_hint": "Entwickler-Sicherheitstests",
"object_class": "process"
},
{
"subcontrol_id": "SA-12",
"title": "Supply Chain Protection",
"statement": "Lieferkettenrisiken muessen bewertet und Schutzmassnahmen implementiert werden.",
"keywords": ["supply chain", "lieferkette", "third party", "drittanbieter"],
"action_hint": "assess",
"object_hint": "Lieferkettenrisikobewertung",
"object_class": "risk_artifact"
}
]
}
]
}
@@ -0,0 +1,353 @@
{
"framework_id": "OWASP_ASVS",
"display_name": "OWASP Application Security Verification Standard 4.0",
"license": {
"type": "cc_by_sa_4",
"rag_allowed": true,
"use_as_metadata": true
},
"domains": [
{
"domain_id": "V1",
"title": "Architecture, Design and Threat Modeling",
"aliases": ["architecture", "architektur", "design", "threat modeling", "bedrohungsmodellierung"],
"keywords": ["architektur", "design", "threat model", "bedrohung", "modellierung"],
"subcontrols": [
{
"subcontrol_id": "V1.1",
"title": "Secure Software Development Lifecycle",
"statement": "Ein sicherer Softwareentwicklungs-Lebenszyklus (SSDLC) muss definiert und angewendet werden.",
"keywords": ["sdlc", "lifecycle", "lebenszyklus", "entwicklung"],
"action_hint": "implement",
"object_hint": "Sicherer Entwicklungs-Lebenszyklus",
"object_class": "process"
},
{
"subcontrol_id": "V1.2",
"title": "Authentication Architecture",
"statement": "Die Authentifizierungsarchitektur muss dokumentiert und regelmaessig ueberprueft werden.",
"keywords": ["authentication", "authentifizierung", "architektur"],
"action_hint": "document",
"object_hint": "Authentifizierungsarchitektur",
"object_class": "policy"
},
{
"subcontrol_id": "V1.4",
"title": "Access Control Architecture",
"statement": "Die Zugriffskontrollarchitektur muss dokumentiert und zentral durchgesetzt werden.",
"keywords": ["access control", "zugriffskontrolle", "architektur"],
"action_hint": "document",
"object_hint": "Zugriffskontrollarchitektur",
"object_class": "policy"
},
{
"subcontrol_id": "V1.5",
"title": "Input and Output Architecture",
"statement": "Eingabe- und Ausgabevalidierung muss architektonisch verankert und durchgaengig angewendet werden.",
"keywords": ["input", "output", "eingabe", "ausgabe", "validierung"],
"action_hint": "implement",
"object_hint": "Ein-/Ausgabevalidierung",
"object_class": "technical_control"
},
{
"subcontrol_id": "V1.6",
"title": "Cryptographic Architecture",
"statement": "Kryptographische Mechanismen muessen architektonisch definiert und standardisiert sein.",
"keywords": ["crypto", "kryptographie", "verschluesselung"],
"action_hint": "define",
"object_hint": "Kryptographie-Architektur",
"object_class": "cryptographic_control"
}
]
},
{
"domain_id": "V2",
"title": "Authentication",
"aliases": ["authentication", "authentifizierung", "anmeldung", "login"],
"keywords": ["authentication", "authentifizierung", "passwort", "login", "anmeldung", "credential"],
"subcontrols": [
{
"subcontrol_id": "V2.1",
"title": "Password Security",
"statement": "Passwortrichtlinien muessen Mindestlaenge, Komplexitaet und Sperrmechanismen definieren.",
"keywords": ["passwort", "password", "laenge", "komplexitaet"],
"action_hint": "define",
"object_hint": "Passwortrichtlinie",
"object_class": "policy"
},
{
"subcontrol_id": "V2.2",
"title": "General Authenticator Security",
"statement": "Authentifizierungsmittel muessen sicher gespeichert und uebertragen werden.",
"keywords": ["authenticator", "credential", "speicherung"],
"action_hint": "implement",
"object_hint": "Sichere Credential-Verwaltung",
"object_class": "technical_control"
},
{
"subcontrol_id": "V2.7",
"title": "Out-of-Band Verification",
"statement": "Out-of-Band-Verifikationsmechanismen muessen sicher implementiert werden.",
"keywords": ["oob", "out-of-band", "sms", "push"],
"action_hint": "implement",
"object_hint": "Out-of-Band-Verifikation",
"object_class": "technical_control"
},
{
"subcontrol_id": "V2.8",
"title": "Multi-Factor Authentication",
"statement": "Multi-Faktor-Authentifizierung muss fuer sicherheitskritische Funktionen verfuegbar sein.",
"keywords": ["mfa", "multi-faktor", "totp", "fido"],
"action_hint": "implement",
"object_hint": "Multi-Faktor-Authentifizierung",
"object_class": "technical_control"
}
]
},
{
"domain_id": "V3",
"title": "Session Management",
"aliases": ["session", "sitzung", "session management", "sitzungsverwaltung"],
"keywords": ["session", "sitzung", "token", "cookie", "timeout"],
"subcontrols": [
{
"subcontrol_id": "V3.1",
"title": "Session Management Security",
"statement": "Sitzungstoken muessen sicher erzeugt, uebertragen und invalidiert werden.",
"keywords": ["token", "sitzung", "sicherheit"],
"action_hint": "implement",
"object_hint": "Sichere Sitzungsverwaltung",
"object_class": "technical_control"
},
{
"subcontrol_id": "V3.3",
"title": "Session Termination",
"statement": "Sitzungen muessen nach Inaktivitaet und bei Abmeldung zuverlaessig beendet werden.",
"keywords": ["termination", "timeout", "abmeldung", "beenden"],
"action_hint": "configure",
"object_hint": "Sitzungstimeout",
"object_class": "configuration"
},
{
"subcontrol_id": "V3.5",
"title": "Token-Based Session Management",
"statement": "Tokenbasierte Sitzungsmechanismen muessen gegen Diebstahl und Replay geschuetzt sein.",
"keywords": ["jwt", "token", "replay", "diebstahl"],
"action_hint": "implement",
"object_hint": "Token-Schutz",
"object_class": "technical_control"
}
]
},
{
"domain_id": "V5",
"title": "Validation, Sanitization and Encoding",
"aliases": ["validation", "validierung", "sanitization", "encoding", "eingabevalidierung"],
"keywords": ["validierung", "sanitization", "encoding", "xss", "injection", "eingabe"],
"subcontrols": [
{
"subcontrol_id": "V5.1",
"title": "Input Validation",
"statement": "Alle Eingabedaten muessen serverseitig validiert werden.",
"keywords": ["input", "eingabe", "validierung", "serverseitig"],
"action_hint": "implement",
"object_hint": "Eingabevalidierung",
"object_class": "technical_control"
},
{
"subcontrol_id": "V5.2",
"title": "Sanitization and Sandboxing",
"statement": "Eingaben muessen bereinigt und in sicherer Umgebung verarbeitet werden.",
"keywords": ["sanitization", "bereinigung", "sandbox"],
"action_hint": "implement",
"object_hint": "Eingabebereinigung",
"object_class": "technical_control"
},
{
"subcontrol_id": "V5.3",
"title": "Output Encoding and Injection Prevention",
"statement": "Ausgaben muessen kontextabhaengig kodiert werden, um Injection-Angriffe zu verhindern.",
"keywords": ["output", "encoding", "injection", "xss", "sql"],
"action_hint": "implement",
"object_hint": "Ausgabe-Encoding",
"object_class": "technical_control"
}
]
},
{
"domain_id": "V6",
"title": "Stored Cryptography",
"aliases": ["cryptography", "kryptographie", "verschluesselung", "stored cryptography"],
"keywords": ["kryptographie", "verschluesselung", "hashing", "schluessel", "key management"],
"subcontrols": [
{
"subcontrol_id": "V6.1",
"title": "Data Classification",
"statement": "Daten muessen klassifiziert und entsprechend ihrer Schutzklasse behandelt werden.",
"keywords": ["klassifizierung", "classification", "schutzklasse"],
"action_hint": "define",
"object_hint": "Datenklassifizierung",
"object_class": "data"
},
{
"subcontrol_id": "V6.2",
"title": "Algorithms",
"statement": "Nur zugelassene und aktuelle kryptographische Algorithmen duerfen verwendet werden.",
"keywords": ["algorithmus", "algorithm", "aes", "rsa"],
"action_hint": "configure",
"object_hint": "Kryptographische Algorithmen",
"object_class": "cryptographic_control"
},
{
"subcontrol_id": "V6.4",
"title": "Secret Management",
"statement": "Geheimnisse (Schluessel, Passwoerter, Tokens) muessen in einem Secret-Management-System verwaltet werden.",
"keywords": ["secret", "geheimnis", "vault", "key management"],
"action_hint": "maintain",
"object_hint": "Secret-Management",
"object_class": "cryptographic_control"
}
]
},
{
"domain_id": "V8",
"title": "Data Protection",
"aliases": ["data protection", "datenschutz", "datenverarbeitung"],
"keywords": ["datenschutz", "data protection", "pii", "personenbezogen", "privacy"],
"subcontrols": [
{
"subcontrol_id": "V8.1",
"title": "General Data Protection",
"statement": "Personenbezogene Daten muessen gemaess Datenschutzanforderungen geschuetzt werden.",
"keywords": ["personenbezogen", "pii", "datenschutz"],
"action_hint": "implement",
"object_hint": "Datenschutzmassnahmen",
"object_class": "data"
},
{
"subcontrol_id": "V8.2",
"title": "Client-Side Data Protection",
"statement": "Clientseitig gespeicherte sensible Daten muessen geschuetzt und minimiert werden.",
"keywords": ["client", "browser", "localstorage", "cookie"],
"action_hint": "implement",
"object_hint": "Clientseitiger Datenschutz",
"object_class": "technical_control"
},
{
"subcontrol_id": "V8.3",
"title": "Sensitive Private Data",
"statement": "Sensible Daten muessen bei Speicherung und Verarbeitung besonders geschuetzt werden.",
"keywords": ["sensibel", "vertraulich", "speicherung"],
"action_hint": "encrypt",
"object_hint": "Verschluesselung sensibler Daten",
"object_class": "data"
}
]
},
{
"domain_id": "V9",
"title": "Communication",
"aliases": ["communication", "kommunikation", "tls", "transport"],
"keywords": ["tls", "ssl", "https", "transport", "kommunikation", "verschluesselung"],
"subcontrols": [
{
"subcontrol_id": "V9.1",
"title": "Client Communication Security",
"statement": "Alle Client-Server-Kommunikation muss ueber TLS verschluesselt werden.",
"keywords": ["tls", "https", "client", "server"],
"action_hint": "encrypt",
"object_hint": "TLS-Transportverschluesselung",
"object_class": "cryptographic_control"
},
{
"subcontrol_id": "V9.2",
"title": "Server Communication Security",
"statement": "Server-zu-Server-Kommunikation muss authentifiziert und verschluesselt erfolgen.",
"keywords": ["server", "mtls", "backend"],
"action_hint": "encrypt",
"object_hint": "Server-Kommunikationsverschluesselung",
"object_class": "cryptographic_control"
}
]
},
{
"domain_id": "V13",
"title": "API and Web Service",
"aliases": ["api", "web service", "rest", "graphql", "webservice"],
"keywords": ["api", "rest", "graphql", "webservice", "endpoint", "schnittstelle"],
"subcontrols": [
{
"subcontrol_id": "V13.1",
"title": "Generic Web Service Security",
"statement": "Web-Services muessen gegen gaengige Angriffe abgesichert werden.",
"keywords": ["web service", "sicherheit", "angriff"],
"action_hint": "implement",
"object_hint": "Web-Service-Absicherung",
"object_class": "interface"
},
{
"subcontrol_id": "V13.2",
"title": "RESTful Web Service",
"statement": "REST-APIs muessen Input-Validierung, Rate Limiting und sichere Authentifizierung implementieren.",
"keywords": ["rest", "api", "rate limiting", "input"],
"action_hint": "implement",
"object_hint": "REST-API-Absicherung",
"object_class": "interface"
},
{
"subcontrol_id": "V13.4",
"title": "GraphQL and Web Services",
"statement": "GraphQL-Endpoints muessen gegen Query-Complexity-Angriffe und Introspection geschuetzt werden.",
"keywords": ["graphql", "query", "complexity", "introspection"],
"action_hint": "configure",
"object_hint": "GraphQL-Absicherung",
"object_class": "interface"
}
]
},
{
"domain_id": "V14",
"title": "Configuration",
"aliases": ["configuration", "konfiguration", "hardening", "haertung"],
"keywords": ["konfiguration", "hardening", "haertung", "header", "deployment"],
"subcontrols": [
{
"subcontrol_id": "V14.1",
"title": "Build and Deploy",
"statement": "Build- und Deployment-Prozesse muessen sicher konfiguriert und reproduzierbar sein.",
"keywords": ["build", "deploy", "ci/cd", "pipeline"],
"action_hint": "configure",
"object_hint": "Sichere Build-Pipeline",
"object_class": "configuration"
},
{
"subcontrol_id": "V14.2",
"title": "Dependency Management",
"statement": "Abhaengigkeiten muessen auf Schwachstellen geprueft und aktuell gehalten werden.",
"keywords": ["dependency", "abhaengigkeit", "sca", "sbom"],
"action_hint": "maintain",
"object_hint": "Abhaengigkeitsverwaltung",
"object_class": "system"
},
{
"subcontrol_id": "V14.3",
"title": "Unintended Security Disclosure",
"statement": "Fehlermeldungen und Debug-Informationen duerfen keine sicherheitsrelevanten Details preisgeben.",
"keywords": ["disclosure", "fehlermeldung", "debug", "information leakage"],
"action_hint": "configure",
"object_hint": "Fehlerbehandlung",
"object_class": "configuration"
},
{
"subcontrol_id": "V14.4",
"title": "HTTP Security Headers",
"statement": "HTTP-Sicherheitsheader muessen korrekt konfiguriert sein.",
"keywords": ["header", "csp", "hsts", "x-frame"],
"action_hint": "configure",
"object_hint": "HTTP-Sicherheitsheader",
"object_class": "configuration"
}
]
}
]
}
@@ -0,0 +1,205 @@
"""
Source-Type-Klassifikation fuer Regulierungen und Frameworks.
Dreistufiges Modell der normativen Verbindlichkeit:
Stufe 1 GESETZ (law):
Rechtlich bindend. Bussgeld bei Verstoss.
Beispiele: DSGVO, NIS2, AI Act, CRA
Stufe 2 LEITLINIE (guideline):
Offizielle Auslegungshilfe von Aufsichtsbehoerden.
Beweislastumkehr: Wer abweicht, muss begruenden warum.
Beispiele: EDPB-Leitlinien, BSI-Standards, WP29-Dokumente
Stufe 3 FRAMEWORK (framework):
Freiwillige Best Practices, nicht rechtsverbindlich.
Aber: Koennen als "Stand der Technik" herangezogen werden.
Beispiele: ENISA, NIST, OWASP, OECD, CISA
Mapping: source_regulation (aus control_parent_links) -> source_type
"""
# --- Type definitions ---
SOURCE_TYPE_LAW = "law"              # statute/regulation/directive — normative_strength is kept
SOURCE_TYPE_GUIDELINE = "guideline"  # guideline/standard — at most "should"
SOURCE_TYPE_FRAMEWORK = "framework"  # framework/best practice — at most "may"

# Maximum normative_strength allowed per source_type.
# The DB constraint allows: must, should, may (NOT "can").
NORMATIVE_STRENGTH_CAP: dict[str, str] = {
    SOURCE_TYPE_LAW: "must",          # no limit
    SOURCE_TYPE_GUIDELINE: "should",  # at most "should"
    SOURCE_TYPE_FRAMEWORK: "may",     # at most "may"
}

# Ordering used for comparisons (higher = stronger).
STRENGTH_ORDER: dict[str, int] = {
    "may": 1,     # DB value
    "can": 1,     # alias — normalized to "may" by cap_normative_strength
    "should": 2,
    "must": 3,
}

# Alias spellings mapped to the canonical value stored in the DB.
_STRENGTH_ALIASES: dict[str, str] = {"can": "may"}


def cap_normative_strength(original: str, source_type: str) -> str:
    """
    Limit a normative_strength value according to the source_type.

    The alias "can" is first normalized to "may", so the alias never leaks
    into the result regardless of the source_type. The (normalized) value
    is then compared against the cap of the source_type and lowered to the
    cap when it is stronger.

    Args:
        original: Strength value to limit ("must", "should", "may" or the
            alias "can"). A value missing from STRENGTH_ORDER is ranked
            like "must".
        source_type: One of the SOURCE_TYPE_* values. A value missing from
            NORMATIVE_STRENGTH_CAP is not capped (treated like "law").

    Returns:
        The cap of the source_type if the input is stronger than the cap,
        otherwise the normalized input value.

    Examples:
        cap_normative_strength("must", "framework")  -> "may"
        cap_normative_strength("should", "law")      -> "should"
        cap_normative_strength("must", "guideline")  -> "should"
        cap_normative_strength("can", "law")         -> "may"
    """
    # Normalize first: previously "can" was returned unchanged whenever it
    # did not exceed the cap, although only "may" is a valid stored value.
    normalized = _STRENGTH_ALIASES.get(original, original)

    cap = NORMATIVE_STRENGTH_CAP.get(source_type, "must")
    cap_level = STRENGTH_ORDER.get(cap, 3)
    original_level = STRENGTH_ORDER.get(normalized, 3)

    if original_level > cap_level:
        return cap
    return normalized
def get_highest_source_type(source_types: list[str]) -> str:
    """
    Pick the most binding source_type from a list.

    A law outranks a guideline, which outranks a framework. Entries that
    are not one of the three known types rank below all of them. An empty
    (or otherwise falsy) input yields the framework type.

    Examples:
        get_highest_source_type(["framework", "law"])        -> "law"
        get_highest_source_type(["framework", "guideline"])  -> "guideline"
    """
    if not source_types:
        return SOURCE_TYPE_FRAMEWORK

    precedence = {
        SOURCE_TYPE_LAW: 3,
        SOURCE_TYPE_GUIDELINE: 2,
        SOURCE_TYPE_FRAMEWORK: 1,
    }

    def _rank(candidate):
        # Unknown types get rank 0 so any known type beats them.
        return precedence.get(candidate, 0)

    return max(source_types, key=_rank)
# ============================================================================
# Classification: source_regulation -> source_type
#
# This map is used for the backfill and for future pipeline runs.
# Add new regulations here!
# ============================================================================
SOURCE_REGULATION_CLASSIFICATION: dict[str, str] = {
    # --- EU regulations (directly binding) ---
    "DSGVO (EU) 2016/679": SOURCE_TYPE_LAW,
    "KI-Verordnung (EU) 2024/1689": SOURCE_TYPE_LAW,
    "Cyber Resilience Act (CRA)": SOURCE_TYPE_LAW,
    "NIS2-Richtlinie (EU) 2022/2555": SOURCE_TYPE_LAW,
    "Data Act": SOURCE_TYPE_LAW,
    "Data Governance Act (DGA)": SOURCE_TYPE_LAW,
    "Markets in Crypto-Assets (MiCA)": SOURCE_TYPE_LAW,
    "Maschinenverordnung (EU) 2023/1230": SOURCE_TYPE_LAW,
    "Batterieverordnung (EU) 2023/1542": SOURCE_TYPE_LAW,
    "AML-Verordnung": SOURCE_TYPE_LAW,
    # --- EU directives (binding after national transposition) ---
    # Treated like laws for compliance purposes. No entries are listed under
    # this heading; the NIS2 directive entry sits in the group above.
    # --- National laws ---
    "Bundesdatenschutzgesetz (BDSG)": SOURCE_TYPE_LAW,
    "Telekommunikationsgesetz": SOURCE_TYPE_LAW,
    "Telekommunikationsgesetz Oesterreich": SOURCE_TYPE_LAW,
    "Gewerbeordnung (GewO)": SOURCE_TYPE_LAW,
    "Handelsgesetzbuch (HGB)": SOURCE_TYPE_LAW,
    "Abgabenordnung (AO)": SOURCE_TYPE_LAW,
    "IFRS-Übernahmeverordnung": SOURCE_TYPE_LAW,
    "Österreichisches Datenschutzgesetz (DSG)": SOURCE_TYPE_LAW,
    "LOPDGDD - Ley Orgánica de Protección de Datos (Spanien)": SOURCE_TYPE_LAW,
    "Loi Informatique et Libertés (Frankreich)": SOURCE_TYPE_LAW,
    "Információs önrendelkezési jog törvény (Ungarn)": SOURCE_TYPE_LAW,
    # NOTE(review): the Blue Guide looks like a guidance document rather than
    # a statute — confirm that classifying it as law is intended.
    "EU Blue Guide 2022": SOURCE_TYPE_LAW,
    # --- EDPB/WP29 guidelines (official interpretation aids) ---
    "EDPB Leitlinien 01/2019 (Zertifizierung)": SOURCE_TYPE_GUIDELINE,
    "EDPB Leitlinien 01/2020 (Datentransfers)": SOURCE_TYPE_GUIDELINE,
    "EDPB Leitlinien 01/2020 (Vernetzte Fahrzeuge)": SOURCE_TYPE_GUIDELINE,
    "EDPB Leitlinien 01/2022 (BCR)": SOURCE_TYPE_GUIDELINE,
    "EDPB Leitlinien 01/2024 (Berechtigtes Interesse)": SOURCE_TYPE_GUIDELINE,
    "EDPB Leitlinien 04/2019 (Data Protection by Design)": SOURCE_TYPE_GUIDELINE,
    "EDPB Leitlinien 05/2020 - Einwilligung": SOURCE_TYPE_GUIDELINE,
    "EDPB Leitlinien 07/2020 (Datentransfers)": SOURCE_TYPE_GUIDELINE,
    "EDPB Leitlinien 08/2020 (Social Media)": SOURCE_TYPE_GUIDELINE,
    "EDPB Leitlinien 09/2022 (Data Breach)": SOURCE_TYPE_GUIDELINE,
    "EDPB Leitlinien 09/2022 - Meldung von Datenschutzverletzungen": SOURCE_TYPE_GUIDELINE,
    "EDPB Empfehlungen 01/2020 - Ergaenzende Massnahmen fuer Datentransfers": SOURCE_TYPE_GUIDELINE,
    "EDPB Leitlinien - Berechtigtes Interesse (Art. 6(1)(f))": SOURCE_TYPE_GUIDELINE,
    "WP244 Leitlinien (Profiling)": SOURCE_TYPE_GUIDELINE,
    "WP251 Leitlinien (Profiling)": SOURCE_TYPE_GUIDELINE,
    "WP260 Leitlinien (Transparenz)": SOURCE_TYPE_GUIDELINE,
    # --- BSI standards (technical guidelines issued by a public authority) ---
    "BSI-TR-03161-1": SOURCE_TYPE_GUIDELINE,
    "BSI-TR-03161-2": SOURCE_TYPE_GUIDELINE,
    "BSI-TR-03161-3": SOURCE_TYPE_GUIDELINE,
    # --- ENISA (EU agency, but its recommendations are not legally binding) ---
    "ENISA Cybersecurity State 2024": SOURCE_TYPE_FRAMEWORK,
    "ENISA ICS/SCADA Dependencies": SOURCE_TYPE_FRAMEWORK,
    "ENISA Supply Chain Good Practices": SOURCE_TYPE_FRAMEWORK,
    "ENISA Threat Landscape Supply Chain": SOURCE_TYPE_FRAMEWORK,
    # --- NIST (US standards, used internationally as best practice) ---
    "NIST AI Risk Management Framework": SOURCE_TYPE_FRAMEWORK,
    "NIST Cybersecurity Framework 2.0": SOURCE_TYPE_FRAMEWORK,
    "NIST SP 800-207 (Zero Trust)": SOURCE_TYPE_FRAMEWORK,
    "NIST SP 800-218 (SSDF)": SOURCE_TYPE_FRAMEWORK,
    "NIST SP 800-53 Rev. 5": SOURCE_TYPE_FRAMEWORK,
    "NIST SP 800-63-3": SOURCE_TYPE_FRAMEWORK,
    # --- OWASP (community standards) ---
    "OWASP API Security Top 10 (2023)": SOURCE_TYPE_FRAMEWORK,
    "OWASP ASVS 4.0": SOURCE_TYPE_FRAMEWORK,
    "OWASP MASVS 2.0": SOURCE_TYPE_FRAMEWORK,
    "OWASP SAMM 2.0": SOURCE_TYPE_FRAMEWORK,
    "OWASP Top 10 (2021)": SOURCE_TYPE_FRAMEWORK,
    # --- Other frameworks ---
    "OECD KI-Empfehlung": SOURCE_TYPE_FRAMEWORK,
    "CISA Secure by Design": SOURCE_TYPE_FRAMEWORK,
}
def classify_source_regulation(source_regulation: str) -> str:
    """
    Classify a source_regulation as law, guideline or framework.

    The name is first looked up verbatim in SOURCE_REGULATION_CLASSIFICATION.
    Unknown names are classified by keyword heuristics, checked in the order
    law -> guideline -> framework. The fallback is 'framework', the least
    binding (most conservative) result.

    The short law indicators "act", "ley" and "loi" are matched as whole
    words only. As plain substrings they also hit unrelated words such as
    "practices", "proactive" or "exploit", which promoted frameworks to law.

    Args:
        source_regulation: Name of the regulation/framework; may be empty.

    Returns:
        One of SOURCE_TYPE_LAW, SOURCE_TYPE_GUIDELINE, SOURCE_TYPE_FRAMEWORK.
    """
    import re  # function-scope import keeps this edit self-contained

    if not source_regulation:
        return SOURCE_TYPE_FRAMEWORK

    # Exact match against the curated map.
    known = SOURCE_REGULATION_CLASSIFICATION.get(source_regulation)
    if known is not None:
        return known

    # Heuristics for unknown sources.
    lower = source_regulation.lower()
    words = set(re.findall(r"\w+", lower))

    # Law indicators that are safe as substrings (long, or needed inside
    # German compound nouns such as "Datenschutzgesetz").
    law_substrings = (
        "verordnung", "richtlinie", "gesetz", "directive", "regulation",
        "(eu)", "(eg)", "törvény", "código",
    )
    # Short law indicators: only valid as standalone words.
    law_words = {"act", "ley", "loi"}
    if any(ind in lower for ind in law_substrings) or not law_words.isdisjoint(words):
        return SOURCE_TYPE_LAW

    # Guideline indicators.
    guideline_indicators = (
        "edpb", "leitlinie", "guideline", "wp2", "bsi", "empfehlung",
    )
    if any(ind in lower for ind in guideline_indicators):
        return SOURCE_TYPE_GUIDELINE

    # Framework indicators (same result as the fallback; kept so the known
    # issuers stay listed explicitly).
    framework_indicators = (
        "enisa", "nist", "owasp", "oecd", "cisa", "framework", "iso",
    )
    if any(ind in lower for ind in framework_indicators):
        return SOURCE_TYPE_FRAMEWORK

    # Conservative: unknown = framework (least binding).
    return SOURCE_TYPE_FRAMEWORK
@@ -8,12 +8,16 @@ from .models import (
EvidenceDB,
RiskDB,
AuditExportDB,
LLMGenerationAuditDB,
AssertionDB,
RegulationTypeEnum,
ControlTypeEnum,
ControlDomainEnum,
RiskLevelEnum,
EvidenceStatusEnum,
ControlStatusEnum,
EvidenceConfidenceEnum,
EvidenceTruthStatusEnum,
)
from .repository import (
RegulationRepository,
@@ -33,6 +37,8 @@ __all__ = [
"EvidenceDB",
"RiskDB",
"AuditExportDB",
"LLMGenerationAuditDB",
"AssertionDB",
# Enums
"RegulationTypeEnum",
"ControlTypeEnum",
@@ -40,6 +46,8 @@ __all__ = [
"RiskLevelEnum",
"EvidenceStatusEnum",
"ControlStatusEnum",
"EvidenceConfidenceEnum",
"EvidenceTruthStatusEnum",
# Repositories
"RegulationRepository",
"RequirementRepository",
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,164 @@
"""
SQLAlchemy models for VVT Master Libraries + Process Templates.
Tables (global, no tenant_id):
- vvt_lib_data_subjects
- vvt_lib_data_categories (hierarchical, self-referencing)
- vvt_lib_recipients
- vvt_lib_legal_bases
- vvt_lib_retention_rules
- vvt_lib_transfer_mechanisms
- vvt_lib_purposes
- vvt_lib_toms
Tenant-scoped:
- vvt_process_templates (system + tenant-specific)
"""
from datetime import datetime, timezone

from sqlalchemy import (
    Column, String, Text, Boolean, Integer, DateTime, JSON, Index,
    ForeignKey,
)
from sqlalchemy.dialects.postgresql import UUID

from classroom_engine.database import Base
class VVTLibDataSubjectDB(Base):
    """Global (non-tenant) master-library row in ``vvt_lib_data_subjects``.

    Rows are keyed by a short string id and carry a German label/description.
    """

    __tablename__ = 'vvt_lib_data_subjects'

    id = Column(String(50), primary_key=True)
    label_de = Column(String(200), nullable=False)
    description_de = Column(Text)
    # presumably flags relevance for GDPR Art. 9 special categories — confirm
    art9_relevant = Column(Boolean, default=False)
    # JSON list; `list` is passed as a callable so every row gets a fresh list
    typical_for = Column(JSON, default=list)
    sort_order = Column(Integer, default=0)
    # The column is timezone-aware, so the default must be an aware UTC value;
    # datetime.utcnow() is naive (and deprecated since Python 3.12).
    created_at = Column(DateTime(timezone=True), default=lambda: datetime.now(timezone.utc))
class VVTLibDataCategoryDB(Base):
    """Global master-library row in ``vvt_lib_data_categories``.

    The table is hierarchical: ``parent_id`` references another row of the
    same table and is set to NULL when that parent is deleted.
    """

    __tablename__ = 'vvt_lib_data_categories'

    id = Column(String(50), primary_key=True)
    # Self-reference; deleting the parent detaches the children (SET NULL)
    parent_id = Column(String(50), ForeignKey('vvt_lib_data_categories.id', ondelete='SET NULL'), nullable=True)
    label_de = Column(String(200), nullable=False)
    description_de = Column(Text)
    # presumably GDPR Art. 9 / Art. 10 markers — confirm
    is_art9 = Column(Boolean, default=False)
    is_art10 = Column(Boolean, default=False)
    risk_weight = Column(Integer, default=1)
    # Plain string references (no FK constraint declared here)
    default_retention_rule = Column(String(50))
    default_legal_basis = Column(String(50))
    sort_order = Column(Integer, default=0)
    # The column is timezone-aware, so the default must be an aware UTC value;
    # datetime.utcnow() is naive (and deprecated since Python 3.12).
    created_at = Column(DateTime(timezone=True), default=lambda: datetime.now(timezone.utc))
class VVTLibRecipientDB(Base):
    """Global master-library row in ``vvt_lib_recipients``."""

    __tablename__ = 'vvt_lib_recipients'

    id = Column(String(50), primary_key=True)
    # Required recipient type code (max. 20 characters)
    type = Column(String(20), nullable=False)
    label_de = Column(String(200), nullable=False)
    description_de = Column(Text)
    is_third_country = Column(Boolean, default=False)
    # Short country code (max. 5 characters); presumably ISO — confirm
    country = Column(String(5))
    sort_order = Column(Integer, default=0)
    # The column is timezone-aware, so the default must be an aware UTC value;
    # datetime.utcnow() is naive (and deprecated since Python 3.12).
    created_at = Column(DateTime(timezone=True), default=lambda: datetime.now(timezone.utc))
class VVTLibLegalBasisDB(Base):
    """Global master-library row in ``vvt_lib_legal_bases``."""

    __tablename__ = 'vvt_lib_legal_bases'

    id = Column(String(50), primary_key=True)
    # Required article reference and type code of the legal basis
    article = Column(String(50), nullable=False)
    type = Column(String(30), nullable=False)
    label_de = Column(String(300), nullable=False)
    description_de = Column(Text)
    # presumably marks GDPR Art. 9 legal bases — confirm
    is_art9 = Column(Boolean, default=False)
    typical_national_law = Column(String(100))
    sort_order = Column(Integer, default=0)
    # The column is timezone-aware, so the default must be an aware UTC value;
    # datetime.utcnow() is naive (and deprecated since Python 3.12).
    created_at = Column(DateTime(timezone=True), default=lambda: datetime.now(timezone.utc))
class VVTLibRetentionRuleDB(Base):
    """Global master-library row in ``vvt_lib_retention_rules``."""

    __tablename__ = 'vvt_lib_retention_rules'

    id = Column(String(50), primary_key=True)
    label_de = Column(String(300), nullable=False)
    description_de = Column(Text)
    legal_basis = Column(String(200))
    # Required numeric duration plus the unit it is expressed in
    duration = Column(Integer, nullable=False)
    duration_unit = Column(String(10), nullable=False)
    start_event = Column(String(200))
    deletion_procedure = Column(String(500))
    sort_order = Column(Integer, default=0)
    # The column is timezone-aware, so the default must be an aware UTC value;
    # datetime.utcnow() is naive (and deprecated since Python 3.12).
    created_at = Column(DateTime(timezone=True), default=lambda: datetime.now(timezone.utc))
class VVTLibTransferMechanismDB(Base):
    """Global master-library row in ``vvt_lib_transfer_mechanisms``."""

    __tablename__ = 'vvt_lib_transfer_mechanisms'

    id = Column(String(50), primary_key=True)
    label_de = Column(String(300), nullable=False)
    description_de = Column(Text)
    article = Column(String(50))
    # presumably "transfer impact assessment required" — confirm
    requires_tia = Column(Boolean, default=False)
    sort_order = Column(Integer, default=0)
    # The column is timezone-aware, so the default must be an aware UTC value;
    # datetime.utcnow() is naive (and deprecated since Python 3.12).
    created_at = Column(DateTime(timezone=True), default=lambda: datetime.now(timezone.utc))
class VVTLibPurposeDB(Base):
    """Global master-library row in ``vvt_lib_purposes``."""

    __tablename__ = 'vvt_lib_purposes'

    id = Column(String(50), primary_key=True)
    label_de = Column(String(300), nullable=False)
    description_de = Column(Text)
    # Plain string reference (no FK constraint declared here)
    typical_legal_basis = Column(String(50))
    # JSON list; `list` is passed as a callable so every row gets a fresh list
    typical_for = Column(JSON, default=list)
    sort_order = Column(Integer, default=0)
    # The column is timezone-aware, so the default must be an aware UTC value;
    # datetime.utcnow() is naive (and deprecated since Python 3.12).
    created_at = Column(DateTime(timezone=True), default=lambda: datetime.now(timezone.utc))
class VVTLibTomDB(Base):
    """Global master-library row in ``vvt_lib_toms``."""

    __tablename__ = 'vvt_lib_toms'

    id = Column(String(50), primary_key=True)
    # Required category code of the measure
    category = Column(String(30), nullable=False)
    label_de = Column(String(300), nullable=False)
    description_de = Column(Text)
    # presumably a GDPR Art. 32 reference — confirm
    art32_reference = Column(String(100))
    sort_order = Column(Integer, default=0)
    # The column is timezone-aware, so the default must be an aware UTC value;
    # datetime.utcnow() is naive (and deprecated since Python 3.12).
    created_at = Column(DateTime(timezone=True), default=lambda: datetime.now(timezone.utc))
class VVTProcessTemplateDB(Base):
    """Process template row in ``vvt_process_templates``.

    Holds both system templates (``is_system`` defaults to True, ``tenant_id``
    may be NULL) and tenant-specific ones.  The ``*_refs`` columns are JSON
    lists; they are stored as plain JSON without FK constraints.
    """

    __tablename__ = 'vvt_process_templates'

    id = Column(String(80), primary_key=True)
    name = Column(String(300), nullable=False)
    description = Column(Text)
    business_function = Column(String(50))

    # Library references (JSON lists; `list` gives every row a fresh default)
    purpose_refs = Column(JSON, default=list)
    legal_basis_refs = Column(JSON, default=list)
    data_subject_refs = Column(JSON, default=list)
    data_category_refs = Column(JSON, default=list)
    recipient_refs = Column(JSON, default=list)
    tom_refs = Column(JSON, default=list)
    transfer_mechanism_refs = Column(JSON, default=list)
    retention_rule_ref = Column(String(50))

    typical_systems = Column(JSON, default=list)
    protection_level = Column(String(10), default='MEDIUM')
    dpia_required = Column(Boolean, default=False)
    risk_score = Column(Integer)
    tags = Column(JSON, default=list)

    # Ownership: system-wide template vs. template of a single tenant
    is_system = Column(Boolean, default=True)
    tenant_id = Column(UUID(as_uuid=True), nullable=True)
    sort_order = Column(Integer, default=0)

    # Both columns are timezone-aware, so defaults must be aware UTC values;
    # datetime.utcnow() is naive (and deprecated since Python 3.12).
    created_at = Column(DateTime(timezone=True), default=lambda: datetime.now(timezone.utc))
    updated_at = Column(
        DateTime(timezone=True),
        default=lambda: datetime.now(timezone.utc),
        onupdate=lambda: datetime.now(timezone.utc),
    )

    __table_args__ = (
        Index('idx_vvt_process_templates_bf', 'business_function'),
        Index('idx_vvt_process_templates_system', 'is_system'),
    )
@@ -79,6 +79,26 @@ class VVTActivityDB(Base):
next_review_at = Column(DateTime(timezone=True), nullable=True)
created_by = Column(String(200), default='system')
dsfa_id = Column(UUID(as_uuid=True), nullable=True)
# Library refs (Phase 1 — parallel to freetext fields)
purpose_refs = Column(JSON, nullable=True)
legal_basis_refs = Column(JSON, nullable=True)
data_subject_refs = Column(JSON, nullable=True)
data_category_refs = Column(JSON, nullable=True)
recipient_refs = Column(JSON, nullable=True)
retention_rule_ref = Column(String(50), nullable=True)
transfer_mechanism_refs = Column(JSON, nullable=True)
tom_refs = Column(JSON, nullable=True)
# Cross-module links
linked_loeschfristen_ids = Column(JSON, nullable=True)
linked_tom_measure_ids = Column(JSON, nullable=True)
# Template + risk
source_template_id = Column(String(80), nullable=True)
risk_score = Column(Integer, nullable=True)
art30_completeness = Column(JSON, nullable=True)
created_at = Column(DateTime, default=datetime.utcnow, nullable=False)
updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
@@ -69,7 +69,7 @@ class AnchorFinder:
tags_str = " ".join(control.tags[:3]) if control.tags else ""
query = f"{control.title} {tags_str}".strip()
results = await self.rag.search(
results = await self.rag.search_with_rerank(
query=query,
collection="bp_compliance_ce",
top_k=15,
@@ -0,0 +1,80 @@
"""Assertion Engine — splits text into sentences and classifies each.
Each sentence is tagged as:
- assertion: normative statement (pflicht / empfehlung / kann)
- fact: references concrete evidence artifacts
- rationale: explains why something is required
"""
import re
from typing import Optional
from .normative_patterns import (
PFLICHT_RE, EMPFEHLUNG_RE, KANN_RE, RATIONALE_RE, EVIDENCE_RE,
)
# Sentence splitter: split after . ! ? when followed by whitespace and an uppercase letter (A-Z, Ä, Ö, Ü), or at a blank line
_SENTENCE_SPLIT = re.compile(r'(?<=[.!?])\s+(?=[A-ZÄÖÜ])|(?:\n\s*\n)')
def extract_assertions(
    text: str,
    entity_type: str,
    entity_id: str,
    tenant_id: Optional[str] = None,
) -> list[dict]:
    """Break *text* into sentences and classify every usable one.

    Args:
        text: Free text to analyse; empty or whitespace-only input yields ``[]``.
        entity_type: Copied unchanged into every result dict.
        entity_id: Copied unchanged into every result dict.
        tenant_id: Optional tenant identifier, copied unchanged.

    Returns:
        One dict per sentence of at least 5 characters, ready for AssertionDB
        creation.  ``sentence_index`` is the position in the raw split result,
        so indices may have gaps where short fragments were dropped.
    """
    stripped = text.strip() if text else ""
    if not stripped:
        return []

    records: list[dict] = []
    for position, fragment in enumerate(_SENTENCE_SPLIT.split(stripped)):
        sentence = fragment.strip()
        # Fragments shorter than 5 characters (including empty ones) are noise
        if len(sentence) < 5:
            continue

        kind, tier = _classify_sentence(sentence)
        record = {
            "tenant_id": tenant_id,
            "entity_type": entity_type,
            "entity_id": entity_id,
            "sentence_text": sentence,
            "sentence_index": position,
            "assertion_type": kind,
            "normative_tier": tier,
            "evidence_ids": [],
            "confidence": 0.0,
        }
        records.append(record)

    return records
def _classify_sentence(sentence: str) -> tuple[str, Optional[str]]:
    """Classify one sentence as fact, rationale or (tiered) assertion.

    Precedence: an evidence keyword always wins (``fact``); next a sentence
    whose rationale markers are at least as numerous as its normative markers
    is a ``rationale``; otherwise the first matching normative tier decides
    (pflicht, then empfehlung, then kann).

    Returns:
        ``(assertion_type, normative_tier)``; the tier is ``None`` for facts,
        rationales and assertions without any normative marker.
    """
    # Evidence/fact keywords take priority over everything else
    if EVIDENCE_RE.search(sentence):
        return ("fact", None)

    tiers = (
        ("pflicht", PFLICHT_RE),
        ("empfehlung", EMPFEHLUNG_RE),
        ("kann", KANN_RE),
    )

    # Rationale wins only if it is at least as strongly signalled as the norms
    normative_hits = sum(len(pattern.findall(sentence)) for _, pattern in tiers)
    rationale_hits = len(RATIONALE_RE.findall(sentence))
    if rationale_hits > 0 and rationale_hits >= normative_hits:
        return ("rationale", None)

    # Strongest normative tier first
    for tier_name, pattern in tiers:
        if pattern.search(sentence):
            return ("assertion", tier_name)

    # No marker at all: unclassified assertion
    return ("assertion", None)
@@ -0,0 +1,618 @@
"""Batch Dedup Runner — Orchestrates deduplication of ~85k atomic controls.
Reduces Pass 0b controls from ~85k to ~18-25k unique Master Controls via:
Phase 1: Intra-Group Dedup — same merge_group_hint → pick best, link rest
(85k → ~52k, mostly title-identical short-circuit, no embeddings)
Phase 2: Cross-Group Dedup — embed masters, search Qdrant for similar
masters with different hints (52k → ~18-25k)
All Pass 0b controls have pattern_id=NULL. The primary grouping key is
merge_group_hint (format: "action_type:norm_obj:trigger_key"), which
encodes the normalized action, object, and trigger.
Usage:
runner = BatchDedupRunner(db)
stats = await runner.run(dry_run=True) # preview
stats = await runner.run(dry_run=False) # execute
stats = await runner.run(hint_filter="implement:multi_factor_auth:none")
"""
import json
import logging
import time
from collections import defaultdict
from sqlalchemy import text
from compliance.services.control_dedup import (
canonicalize_text,
ensure_qdrant_collection,
get_embedding,
normalize_action,
normalize_object,
qdrant_search_cross_regulation,
qdrant_upsert,
LINK_THRESHOLD,
REVIEW_THRESHOLD,
)
logger = logging.getLogger(__name__)
DEDUP_COLLECTION = "atomic_controls_dedup"
# ── Quality Score ────────────────────────────────────────────────────────
def _count_json_items(value) -> int:
    """Return the number of entries in a list-like control field.

    *value* may already be a container or a JSON-encoded string.  Anything
    that is missing, cannot be decoded, or does not decode to a container
    (e.g. JSON ``null``, a number or a plain string) counts as zero items.
    """
    if not value:
        return 0
    if isinstance(value, str):
        try:
            value = json.loads(value)
        except (json.JSONDecodeError, TypeError):
            return 0
    # Only real containers have a meaningful item count; len() of a decoded
    # string would count characters and len() of None/int would raise.
    if isinstance(value, (list, tuple, dict, set)):
        return len(value)
    return 0


def quality_score(control: dict) -> float:
    """Score a control by richness of requirements, tests, evidence, and objective.

    Weights: 2.0 per requirement, 1.5 per test-procedure entry, 1.0 per
    evidence entry, plus ``len(objective) / 200`` capped at 3.0.

    Args:
        control: Control dict; the keys ``requirements``, ``test_procedure``
            and ``evidence`` may hold containers or JSON strings, and
            ``objective`` a string.  Missing keys contribute nothing.

    Returns:
        The score; higher means a better candidate for master control.
    """
    score = 0.0
    score += _count_json_items(control.get("requirements")) * 2.0
    score += _count_json_items(control.get("test_procedure")) * 1.5
    score += _count_json_items(control.get("evidence")) * 1.0

    objective = control.get("objective") or ""
    score += min(len(objective) / 200, 3.0)

    return score
# ── Batch Dedup Runner ───────────────────────────────────────────────────
class BatchDedupRunner:
    """Batch dedup orchestrator for existing Pass 0b atomic controls.

    Phase 1 groups controls by ``merge_group_hint`` and keeps the best-scoring
    control of each group as master.  Phase 2 compares the surviving masters
    across groups through the Qdrant dedup collection.  Counters and progress
    live on the instance so :meth:`get_status` can be polled during a run.
    """

    def __init__(self, db, collection: str = DEDUP_COLLECTION):
        """Store the DB handle and target collection and zero all counters.

        Args:
            db: Database session; this class only calls ``execute``,
                ``commit`` and ``rollback`` on it.
            collection: Name of the Qdrant collection holding master embeddings.
        """
        self.db = db
        self.collection = collection
        self.stats = {
            "total_controls": 0,
            "unique_hints": 0,
            "phase1_groups_processed": 0,
            "masters": 0,
            "linked": 0,
            "review": 0,
            "new_controls": 0,
            "parent_links_transferred": 0,
            "cross_group_linked": 0,
            "cross_group_review": 0,
            "errors": 0,
            "skipped_title_identical": 0,
        }
        self._progress_phase = ""
        self._progress_count = 0
        self._progress_total = 0

    async def run(
        self,
        dry_run: bool = False,
        hint_filter: str = None,
    ) -> dict:
        """Run the full batch dedup pipeline.

        Args:
            dry_run: If True, compute stats but don't modify DB/Qdrant.
                Phase 2 is skipped entirely in a dry run.
            hint_filter: If set, only process groups matching this hint prefix.

        Returns:
            Stats dict with counts (plus ``elapsed_seconds``).
        """
        start = time.monotonic()
        logger.info("BatchDedup starting (dry_run=%s, hint_filter=%s)",
                    dry_run, hint_filter)

        if not dry_run:
            await ensure_qdrant_collection(collection=self.collection)

        # Phase 1: Intra-group dedup (same merge_group_hint)
        self._progress_phase = "phase1"
        groups = self._load_merge_groups(hint_filter)
        self._progress_total = self.stats["total_controls"]
        self._progress_count = 0

        for hint, controls in groups:
            try:
                await self._process_hint_group(hint, controls, dry_run)
                self.stats["phase1_groups_processed"] += 1
            except Exception as e:
                # One broken group must not stop the batch
                logger.error("BatchDedup Phase 1 error on hint %s: %s", hint, e)
                self.stats["errors"] += 1
                try:
                    self.db.rollback()
                except Exception as rollback_error:
                    logger.warning("BatchDedup rollback failed after error on hint %s: %s",
                                   hint, rollback_error)

        logger.info(
            "BatchDedup Phase 1 done: %d masters, %d linked, %d review",
            self.stats["masters"], self.stats["linked"], self.stats["review"],
        )

        # Phase 2: Cross-group dedup via embeddings
        if not dry_run:
            self._progress_phase = "phase2"
            await self._run_cross_group_pass()

        elapsed = time.monotonic() - start
        self.stats["elapsed_seconds"] = round(elapsed, 1)
        logger.info("BatchDedup completed in %.1fs: %s", elapsed, self.stats)
        return self.stats

    def _load_merge_groups(self, hint_filter: str = None) -> list:
        """Load all Pass 0b controls grouped by merge_group_hint, largest first.

        Controls without a hint become singleton groups, because an empty hint
        says nothing about shared action/object.  Updates the
        ``total_controls`` and ``unique_hints`` stats.

        Returns:
            List of ``(group_key, controls)`` tuples sorted by group size.
        """
        conditions = [
            "decomposition_method = 'pass0b'",
            "release_state != 'deprecated'",
            "release_state != 'duplicate'",
        ]
        params = {}

        if hint_filter:
            conditions.append("generation_metadata->>'merge_group_hint' LIKE :hf")
            # Hints contain "_" (a LIKE wildcard); escape wildcards so the
            # filter is a literal prefix.  Backslash is PostgreSQL's default
            # LIKE escape character.
            escaped = (
                hint_filter
                .replace("\\", "\\\\")
                .replace("%", "\\%")
                .replace("_", "\\_")
            )
            params["hf"] = f"{escaped}%"

        where = " AND ".join(conditions)
        rows = self.db.execute(text(f"""
            SELECT id::text, control_id, title, objective,
                   pattern_id, requirements::text, test_procedure::text,
                   evidence::text, release_state,
                   generation_metadata->>'merge_group_hint' as merge_group_hint,
                   generation_metadata->>'action_object_class' as action_object_class
            FROM canonical_controls
            WHERE {where}
            ORDER BY control_id
        """), params).fetchall()

        controls = [
            {
                "uuid": r[0],
                "control_id": r[1],
                "title": r[2],
                "objective": r[3],
                "pattern_id": r[4],
                "requirements": r[5],
                "test_procedure": r[6],
                "evidence": r[7],
                "release_state": r[8],
                "merge_group_hint": r[9] or "",
                "action_object_class": r[10] or "",
            }
            for r in rows
        ]

        self.stats["total_controls"] = len(controls)
        self.stats["unique_hints"] = len({c["merge_group_hint"] for c in controls})

        by_hint = self._sub_group_by_merge_hint(controls)
        sorted_groups = sorted(by_hint.items(), key=lambda x: len(x[1]), reverse=True)
        logger.info("BatchDedup loaded %d controls in %d hint groups",
                    len(rows), len(sorted_groups))
        return sorted_groups

    def _sub_group_by_merge_hint(self, controls: list) -> dict:
        """Group controls by merge_group_hint composite key.

        Controls with an empty hint get a unique ``__no_hint_<uuid>`` key, so
        each forms its own group.
        """
        groups = defaultdict(list)
        for c in controls:
            hint = c["merge_group_hint"]
            if hint:
                groups[hint].append(c)
            else:
                groups[f"__no_hint_{c['uuid']}"].append(c)
        return dict(groups)

    @staticmethod
    def _split_hint(hint: str) -> tuple:
        """Return ``(action, object)`` from an ``action:object:trigger`` hint."""
        parts = (hint or "").split(":", 2)
        action = parts[0]
        obj = parts[1] if len(parts) > 1 else ""
        return action, obj

    async def _process_hint_group(
        self,
        hint: str,
        controls: list,
        dry_run: bool,
    ):
        """Process all controls sharing the same merge_group_hint.

        Within a hint group, all controls share action+object+trigger.
        The best-quality control becomes master, rest are linked as duplicates
        (directly on identical titles, otherwise after an embedding check).
        Every handled control advances the progress counter.
        """
        if not controls:
            return

        if len(controls) < 2:
            # Singleton → always master
            self.stats["masters"] += 1
            if not dry_run:
                await self._embed_and_index(controls[0])
            self._advance_progress(hint)
            return

        # Sort by quality score (best first)
        sorted_group = sorted(controls, key=quality_score, reverse=True)
        master = sorted_group[0]
        self.stats["masters"] += 1

        if not dry_run:
            await self._embed_and_index(master)
        self._advance_progress(hint)

        master_title = master["title"].strip().lower()
        for candidate in sorted_group[1:]:
            if candidate["title"].strip().lower() == master_title:
                # Identical title → direct link (no embedding needed)
                self.stats["linked"] += 1
                self.stats["skipped_title_identical"] += 1
                if not dry_run:
                    await self._mark_duplicate(master, candidate, confidence=1.0)
            else:
                # Different title within same hint → still likely duplicate;
                # use the embedding to verify
                await self._check_and_link_within_group(master, candidate, dry_run)
            self._advance_progress(hint)

    async def _check_and_link_within_group(
        self,
        master: dict,
        candidate: dict,
        dry_run: bool,
    ):
        """Check if candidate (same hint group) is duplicate of master via embedding.

        Outcomes by best similarity score: above ``LINK_THRESHOLD`` the
        candidate is linked to the matched control, above ``REVIEW_THRESHOLD``
        a review entry is written, otherwise the candidate is indexed as a new
        master.  Without an embedding or a usable match it is linked to
        *master* with confidence 0.90.
        """
        action, obj = self._split_hint(candidate["merge_group_hint"])
        canonical = canonicalize_text(action, obj, candidate["title"])
        embedding = await get_embedding(canonical)

        if not embedding:
            # Can't embed → link anyway (same hint = same action+object)
            self.stats["linked"] += 1
            if not dry_run:
                await self._mark_duplicate(master, candidate, confidence=0.90)
            return

        # Search the dedup collection (unfiltered — pattern_id is NULL)
        results = await qdrant_search_cross_regulation(
            embedding, top_k=3, collection=self.collection,
        )

        # Ignore hits without a control UUID and hits on the candidate itself
        # (possible when the collection holds points from an earlier run);
        # linking to those would merge the control into itself.
        usable = []
        for hit in results or []:
            hit_uuid = (hit.get("payload") or {}).get("control_uuid", "")
            if hit_uuid and hit_uuid != candidate["uuid"]:
                usable.append(hit)

        if not usable:
            # No usable match (master might not be indexed yet) → link to master
            self.stats["linked"] += 1
            if not dry_run:
                await self._mark_duplicate(master, candidate, confidence=0.90)
            return

        best = usable[0]
        best_score = best.get("score", 0.0)
        best_payload = best.get("payload") or {}
        best_uuid = best_payload.get("control_uuid", "")

        if best_score > LINK_THRESHOLD:
            self.stats["linked"] += 1
            if not dry_run:
                await self._mark_duplicate_to(best_uuid, candidate, confidence=best_score)
        elif best_score > REVIEW_THRESHOLD:
            self.stats["review"] += 1
            if not dry_run:
                self._write_review(candidate, best_payload, best_score)
        else:
            # Very different despite same hint → new master
            self.stats["new_controls"] += 1
            if not dry_run:
                await self._index_with_embedding(candidate, embedding)

    async def _run_cross_group_pass(self):
        """Phase 2: Find cross-group duplicates among surviving masters.

        Re-reads all non-duplicate, non-deprecated Pass 0b controls, embeds
        each and searches the dedup collection for near-duplicates.  Controls
        demoted during this pass are remembered so a later control cannot be
        merged into one that is already a duplicate (prevents A→B / B→A).
        """
        logger.info("BatchDedup Phase 2: Cross-group pass starting...")

        rows = self.db.execute(text("""
            SELECT id::text, control_id, title,
                   generation_metadata->>'merge_group_hint' as merge_group_hint
            FROM canonical_controls
            WHERE decomposition_method = 'pass0b'
              AND release_state != 'duplicate'
              AND release_state != 'deprecated'
            ORDER BY control_id
        """)).fetchall()

        self._progress_total = len(rows)
        self._progress_count = 0
        logger.info("BatchDedup Cross-group: %d masters to check", len(rows))

        cross_linked = 0
        cross_review = 0
        demoted = set()  # UUIDs marked duplicate during this pass

        for i, row in enumerate(rows):
            outcome = await self._cross_check_master(row, demoted)
            if outcome == "linked":
                cross_linked += 1
            elif outcome == "review":
                cross_review += 1

            # Always advance, also for rows without embedding or matches
            self._progress_count = i + 1
            if (i + 1) % 500 == 0:
                logger.info("BatchDedup Cross-group: %d/%d checked, %d linked, %d review",
                            i + 1, len(rows), cross_linked, cross_review)

        self.stats["cross_group_linked"] = cross_linked
        self.stats["cross_group_review"] = cross_review
        logger.info("BatchDedup Cross-group complete: %d linked, %d review",
                    cross_linked, cross_review)

    async def _cross_check_master(self, row, demoted: set):
        """Compare one master row against the dedup collection.

        Args:
            row: ``(uuid, control_id, title, merge_group_hint)`` row.
            demoted: UUIDs already marked duplicate in this pass; extended
                in place when *row* is merged.

        Returns:
            ``"linked"`` if the control was merged into a match, ``"review"``
            if a review entry was written, otherwise ``None``.
        """
        control_uuid, control_id, title = row[0], row[1], row[2]
        hint = row[3] or ""
        action, obj = self._split_hint(hint)

        canonical = canonicalize_text(action, obj, title)
        embedding = await get_embedding(canonical)
        if not embedding:
            return None

        results = await qdrant_search_cross_regulation(
            embedding, top_k=5, collection=self.collection,
        )

        for match in results or []:
            match_score = match.get("score", 0.0)
            match_payload = match.get("payload") or {}
            match_uuid = match_payload.get("control_uuid", "")

            # Skip unusable hits: no UUID, self-match, or already demoted
            if not match_uuid or match_uuid == control_uuid or match_uuid in demoted:
                continue

            if match_score > LINK_THRESHOLD:
                # Only one cross-link attempt per control
                try:
                    self._persist_merge(
                        match_uuid, control_uuid, match_score,
                        link_type="cross_regulation", skip_if_duplicate=True,
                    )
                except Exception as e:
                    logger.error("BatchDedup cross-group link error %s→%s: %s",
                                 control_uuid, match_uuid, e)
                    self.db.rollback()
                    self.stats["errors"] += 1
                    return None
                demoted.add(control_uuid)
                return "linked"

            if match_score > REVIEW_THRESHOLD:
                try:
                    self._write_review(
                        {"control_id": control_id, "title": title, "objective": "",
                         "merge_group_hint": hint, "pattern_id": None},
                        match_payload, match_score,
                    )
                except Exception:
                    # _write_review already logged and rolled back; a failed
                    # review entry must not abort the whole pass
                    self.stats["errors"] += 1
                    return None
                return "review"

        return None

    # ── Qdrant Helpers ───────────────────────────────────────────────────

    async def _embed_and_index(self, control: dict):
        """Compute embedding and index a control in the dedup Qdrant collection.

        Does nothing when no embedding could be obtained.
        """
        action, obj = self._split_hint(control["merge_group_hint"])
        canonical = canonicalize_text(action, obj, control["title"])
        embedding = await get_embedding(canonical)
        if not embedding:
            return
        await self._index_with_embedding(control, embedding)

    async def _index_with_embedding(self, control: dict, embedding: list):
        """Index a control with a pre-computed embedding."""
        action, obj = self._split_hint(control["merge_group_hint"])
        norm_action = normalize_action(action)
        norm_object = normalize_object(obj)
        canonical = canonicalize_text(action, obj, control["title"])

        await qdrant_upsert(
            point_id=control["uuid"],
            embedding=embedding,
            payload={
                "control_uuid": control["uuid"],
                "control_id": control["control_id"],
                "title": control["title"],
                "pattern_id": control.get("pattern_id"),
                "action_normalized": norm_action,
                "object_normalized": norm_object,
                "canonical_text": canonical,
                "merge_group_hint": control["merge_group_hint"],
            },
            collection=self.collection,
        )

    # ── DB Write Helpers ─────────────────────────────────────────────────

    def _persist_merge(
        self,
        master_uuid: str,
        duplicate_uuid: str,
        confidence: float,
        link_type: str,
        skip_if_duplicate: bool = False,
    ):
        """Mark *duplicate_uuid* as merged into *master_uuid* and commit.

        Sets the duplicate's release state, records a parent link of
        *link_type* and transfers the duplicate's decomposition links to the
        master.  Errors propagate; the caller is responsible for rollback.

        Args:
            skip_if_duplicate: If True the state update only touches rows that
                are not already marked duplicate.
        """
        update_sql = """
            UPDATE canonical_controls
            SET release_state = 'duplicate', merged_into_uuid = CAST(:master AS uuid)
            WHERE id = CAST(:dup AS uuid)
        """
        if skip_if_duplicate:
            update_sql += " AND release_state != 'duplicate'"
        self.db.execute(text(update_sql), {"master": master_uuid, "dup": duplicate_uuid})

        self.db.execute(text("""
            INSERT INTO control_parent_links
            (control_uuid, parent_control_uuid, link_type, confidence)
            VALUES (CAST(:master AS uuid), CAST(:dup AS uuid), :lt, :conf)
            ON CONFLICT (control_uuid, parent_control_uuid) DO NOTHING
        """), {"master": master_uuid, "dup": duplicate_uuid,
               "lt": link_type, "conf": confidence})

        transferred = self._transfer_parent_links(master_uuid, duplicate_uuid)
        self.stats["parent_links_transferred"] += transferred

        self.db.commit()

    async def _mark_duplicate(self, master: dict, candidate: dict, confidence: float):
        """Mark candidate as duplicate of master, transfer parent links.

        Raises:
            Exception: Any database error, after logging and rollback.
        """
        try:
            self._persist_merge(master["uuid"], candidate["uuid"], confidence,
                                link_type="dedup_merge")
        except Exception as e:
            logger.error("BatchDedup _mark_duplicate error %s→%s: %s",
                         candidate["uuid"], master["uuid"], e)
            self.db.rollback()
            raise

    async def _mark_duplicate_to(self, master_uuid: str, candidate: dict, confidence: float):
        """Mark candidate as duplicate of a Qdrant-matched master.

        Raises:
            Exception: Any database error, after logging and rollback.
        """
        try:
            self._persist_merge(master_uuid, candidate["uuid"], confidence,
                                link_type="dedup_merge")
        except Exception as e:
            logger.error("BatchDedup _mark_duplicate_to error %s→%s: %s",
                         candidate["uuid"], master_uuid, e)
            self.db.rollback()
            raise

    def _transfer_parent_links(self, master_uuid: str, duplicate_uuid: str) -> int:
        """Move existing parent links from duplicate to master.

        Only links of type ``decomposition`` are copied; a link pointing at
        the master itself is skipped.  Does not commit.

        Returns:
            Number of link inserts attempted (conflicting rows are ignored by
            the database but still counted).
        """
        rows = self.db.execute(text("""
            SELECT parent_control_uuid::text, link_type, confidence,
                   source_regulation, source_article, obligation_candidate_id::text
            FROM control_parent_links
            WHERE control_uuid = CAST(:dup AS uuid)
              AND link_type = 'decomposition'
        """), {"dup": duplicate_uuid}).fetchall()

        transferred = 0
        for r in rows:
            parent_uuid = r[0]
            if parent_uuid == master_uuid:
                continue
            self.db.execute(text("""
                INSERT INTO control_parent_links
                (control_uuid, parent_control_uuid, link_type, confidence,
                 source_regulation, source_article, obligation_candidate_id)
                VALUES (CAST(:cu AS uuid), CAST(:pu AS uuid), :lt, :conf,
                        :sr, :sa, CAST(:oci AS uuid))
                ON CONFLICT (control_uuid, parent_control_uuid) DO NOTHING
            """), {
                "cu": master_uuid,
                "pu": parent_uuid,
                "lt": r[1],
                # Default only when the confidence is missing; a stored 0.0
                # must stay 0.0
                "conf": float(r[2]) if r[2] is not None else 1.0,
                "sr": r[3],
                "sa": r[4],
                "oci": r[5],
            })
            transferred += 1

        return transferred

    def _write_review(self, candidate: dict, matched_payload: dict, score: float):
        """Write a dedup review entry for borderline matches.

        Raises:
            Exception: Any database error, after logging and rollback.
        """
        try:
            self.db.execute(text("""
                INSERT INTO control_dedup_reviews
                (candidate_control_id, candidate_title, candidate_objective,
                 matched_control_uuid, matched_control_id,
                 similarity_score, dedup_stage, dedup_details)
                VALUES (:ccid, :ct, :co, CAST(:mcu AS uuid), :mci,
                        :ss, 'batch_dedup', CAST(:dd AS jsonb))
            """), {
                "ccid": candidate["control_id"],
                "ct": candidate["title"],
                "co": candidate.get("objective", ""),
                "mcu": matched_payload.get("control_uuid"),
                "mci": matched_payload.get("control_id"),
                "ss": score,
                "dd": json.dumps({
                    "merge_group_hint": candidate.get("merge_group_hint", ""),
                    "pattern_id": candidate.get("pattern_id"),
                }),
            })
            self.db.commit()
        except Exception as e:
            logger.error("BatchDedup _write_review error: %s", e)
            self.db.rollback()
            raise

    # ── Progress ─────────────────────────────────────────────────────────

    def _advance_progress(self, hint: str):
        """Count one handled control and emit the periodic progress log."""
        self._progress_count += 1
        self._log_progress(hint)

    def _log_progress(self, hint: str):
        """Log progress every 500 controls."""
        if self._progress_count > 0 and self._progress_count % 500 == 0:
            logger.info(
                "BatchDedup [%s] %d/%d — masters=%d, linked=%d, review=%d",
                self._progress_phase, self._progress_count, self._progress_total,
                self.stats["masters"], self.stats["linked"], self.stats["review"],
            )

    def get_status(self) -> dict:
        """Return current progress stats (for status endpoint)."""
        return {
            "phase": self._progress_phase,
            "progress": self._progress_count,
            "total": self._progress_total,
            **self.stats,
        }
@@ -0,0 +1,438 @@
"""
Citation Backfill Service — enrich existing controls with article/paragraph provenance.
3-tier matching strategy:
Tier 1 — Hash match: sha256(source_original_text) → RAG chunk lookup
Tier 2 — Regex parse: split concatenated "DSGVO Art. 35" → regulation + article
Tier 3 — Ollama LLM: ask local LLM to identify article/paragraph from text
"""
import hashlib
import json
import logging
import os
import re
from dataclasses import dataclass, field
from datetime import datetime, timezone
from typing import Optional
import httpx
from sqlalchemy import text
from sqlalchemy.orm import Session
from .rag_client import ComplianceRAGClient, RAGSearchResult
logger = logging.getLogger(__name__)
# Ollama endpoint and model; each value can be overridden through the
# environment variable named in the corresponding getenv call.
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
OLLAMA_MODEL = os.getenv("CONTROL_GEN_OLLAMA_MODEL", "qwen3.5:35b-a3b")
# Parsed as float; presumably seconds — confirm against the HTTP client usage
LLM_TIMEOUT = float(os.getenv("CONTROL_GEN_LLM_TIMEOUT", "180"))
# Collections scrolled as a fallback when no collection can be derived from
# the controls' regulation codes (see _build_rag_index_targeted)
ALL_COLLECTIONS = [
"bp_compliance_ce",
"bp_compliance_gesetze",
"bp_compliance_datenschutz",
"bp_dsfa_corpus",
"bp_legal_templates",
]
# German system prompt: tells the model to act as a legal expert, determine
# the exact article and paragraph from a legal text, and answer with valid
# JSON only.
BACKFILL_SYSTEM_PROMPT = (
"Du bist ein Rechtsexperte. Deine Aufgabe ist es, aus einem Gesetzestext "
"den genauen Artikel und Absatz zu bestimmen. Antworte NUR mit validem JSON."
)
# Regex to split concatenated source like "DSGVO Art. 35" or "NIS2 Artikel 21 Abs. 2"
# Group 1 is the (non-greedy) text before the article reference, group 2 the
# reference itself starting at "Art"/"Art."/"Artikel" followed by digits.
_SOURCE_ARTICLE_RE = re.compile(
r"^(.+?)\s+(Art(?:ikel)?\.?\s*\d+.*)$", re.IGNORECASE
)
@dataclass
class MatchResult:
    """Article/paragraph found for a control, plus how it was found.

    ``method`` names the matching tier that produced the result:
    ``"hash"``, ``"regex"`` or ``"llm"``.
    """

    article: str
    paragraph: str
    method: str
@dataclass
class BackfillResult:
    """Counters and error messages collected by one backfill run."""

    # Number of controls loaded for backfill
    total_controls: int = 0
    # Matches per tier
    matched_hash: int = 0
    matched_regex: int = 0
    matched_llm: int = 0
    # Controls for which no tier produced a match
    unmatched: int = 0
    # Controls updated (only counted outside dry runs)
    updated: int = 0
    # Human-readable error messages; a fresh list per instance
    errors: list = field(default_factory=list)
class CitationBackfill:
"""Backfill article/paragraph into existing control source_citations."""
def __init__(self, db: Session, rag_client: ComplianceRAGClient):
    """Keep the database session and RAG client and start with an empty hash index."""
    # sha256 hex digest of a chunk's text -> the RAG chunk itself
    self._rag_index: dict[str, RAGSearchResult] = {}
    self.rag = rag_client
    self.db = db
async def run(self, dry_run: bool = True, limit: int = 0) -> BackfillResult:
    """Main entry: iterate controls missing article/paragraph, match to RAG, update.

    Args:
        dry_run: If True, matches are only counted and logged; nothing is
            written or committed.
        limit: Maximum number of controls to load; ``0`` means no limit.

    Returns:
        A ``BackfillResult`` with per-tier match counts, the number of updated
        controls and any error messages.  Errors on individual controls and a
        failed final commit are recorded in ``errors`` instead of being raised.
    """
    result = BackfillResult()

    # Load controls needing backfill
    controls = self._load_controls_needing_backfill(limit)
    result.total_controls = len(controls)
    logger.info("Backfill: %d controls need article/paragraph enrichment", len(controls))

    if not controls:
        return result

    # Hashes of all available source texts; only used to decide whether the
    # RAG index is worth building and for logging
    needed_hashes: set[str] = {
        hashlib.sha256(ctrl["source_original_text"].encode()).hexdigest()
        for ctrl in controls
        if ctrl.get("source_original_text")
    }

    if needed_hashes:
        # Build targeted RAG index — only scroll collections that our controls reference
        logger.info("Building targeted RAG hash index for %d source texts...", len(needed_hashes))
        await self._build_rag_index_targeted(controls)
        logger.info("RAG index built: %d chunks indexed, %d hashes needed", len(self._rag_index), len(needed_hashes))
    else:
        logger.info("No source_original_text found — skipping RAG index build")

    # Match method -> name of the counter attribute on the result
    counters = {"hash": "matched_hash", "regex": "matched_regex", "llm": "matched_llm"}

    # Process each control
    for i, ctrl in enumerate(controls):
        if i > 0 and i % 100 == 0:
            logger.info("Backfill progress: %d/%d processed", i, result.total_controls)

        try:
            match = await self._match_control(ctrl)
            if not match:
                result.unmatched += 1
                continue

            counter = counters.get(match.method)
            if counter is not None:
                setattr(result, counter, getattr(result, counter) + 1)

            if dry_run:
                logger.debug(
                    "DRY RUN: Would update %s with article=%s paragraph=%s (method=%s)",
                    ctrl["control_id"], match.article, match.paragraph, match.method,
                )
            else:
                self._update_control(ctrl, match)
                result.updated += 1
        except Exception as e:
            # NOTE(review): a database error raised inside _update_control may
            # leave the session needing a rollback before later updates work.
            error_msg = f"Error backfilling {ctrl.get('control_id', '?')}: {e}"
            logger.error(error_msg)
            result.errors.append(error_msg)

    if not dry_run:
        try:
            self.db.commit()
        except Exception as e:
            logger.error("Backfill commit failed: %s", e)
            result.errors.append(f"Commit failed: {e}")
            # A failed commit leaves the session in a failed transaction;
            # roll back so the session stays usable for the caller.
            try:
                self.db.rollback()
            except Exception as rollback_error:
                logger.error("Backfill rollback failed: %s", rollback_error)
            # NOTE(review): assumes _update_control only stages changes that
            # this commit persists — confirm.  Under that assumption nothing
            # was written, so do not report updates.
            result.updated = 0

    logger.info(
        "Backfill complete: %d total, hash=%d regex=%d llm=%d unmatched=%d updated=%d",
        result.total_controls, result.matched_hash, result.matched_regex,
        result.matched_llm, result.unmatched, result.updated,
    )
    return result
def _load_controls_needing_backfill(self, limit: int = 0) -> list[dict]:
    """Load controls where source_citation exists but lacks separate 'article' key.

    Only controls with ``license_rule`` 1 or 2 are considered, ordered by
    ``control_id``.

    Args:
        limit: Maximum number of rows; ``0`` (or a non-positive/empty value)
            loads all matching rows.

    Returns:
        One dict per control with ``id`` as string and the JSON columns
        ``source_citation`` / ``generation_metadata`` decoded when they
        arrive as strings (undecodable values become ``{}``).
    """
    query = """
        SELECT id, control_id, source_citation, source_original_text,
               generation_metadata, license_rule
        FROM canonical_controls
        WHERE license_rule IN (1, 2)
          AND source_citation IS NOT NULL
          AND (
              source_citation->>'article' IS NULL
              OR source_citation->>'article' = ''
          )
        ORDER BY control_id
    """
    params: dict = {}
    if limit and limit > 0:
        # Bound parameter instead of string interpolation into the SQL text
        query += " LIMIT :limit"
        params["limit"] = int(limit)

    result = self.db.execute(text(query), params)
    cols = list(result.keys())
    controls = []
    for row in result:
        ctrl = dict(zip(cols, row))
        ctrl["id"] = str(ctrl["id"])
        # Parse JSON fields that the driver returned as raw strings
        for jf in ("source_citation", "generation_metadata"):
            if isinstance(ctrl.get(jf), str):
                try:
                    ctrl[jf] = json.loads(ctrl[jf])
                except (json.JSONDecodeError, TypeError):
                    ctrl[jf] = {}
        controls.append(ctrl)

    return controls
async def _build_rag_index_targeted(self, controls: list[dict]):
    """Populate ``self._rag_index`` from the collections relevant to *controls*.

    The collections are derived from the controls' regulation codes via
    ``_map_regulations_to_collections``; if that yields nothing, every entry
    of ``ALL_COLLECTIONS`` is scrolled instead. Chunks are keyed by the
    SHA-256 hex digest of their text; chunks with fewer than 50 non-blank
    characters are ignored.
    """
    mapped = self._map_regulations_to_collections(controls)
    targets = set(mapped.values())
    if not targets:
        targets = set(ALL_COLLECTIONS)
    logger.info("Targeted index: searching %d collections: %s",
                len(targets), ", ".join(targets))
    for collection in targets:
        cursor = None
        pages_done = 0
        visited: set[str] = set()
        while True:
            batch, following = await self.rag.scroll(
                collection=collection, offset=cursor, limit=200,
            )
            if not batch:
                break
            self._rag_index.update({
                hashlib.sha256(item.text.encode()).hexdigest(): item
                for item in batch
                if item.text and len(item.text.strip()) >= 50
            })
            pages_done += 1
            if pages_done % 50 == 0:
                logger.info("Indexing %s: page %d (%d chunks so far)",
                            collection, pages_done, len(self._rag_index))
            if not following:
                break
            # A repeated offset means the scroll is cycling; bail out.
            if following in visited:
                logger.warning("Scroll loop in %s at page %d — stopping", collection, pages_done)
                break
            visited.add(following)
            cursor = following
        logger.info("Indexed collection %s: %d pages", collection, pages_done)
def _map_regulations_to_collections(self, controls: list[dict]) -> dict[str, str]:
"""Map regulation codes from controls to likely Qdrant collections."""
# Heuristic: regulation code prefix → collection
collection_map = {
"eu_": "bp_compliance_gesetze",
"dsgvo": "bp_compliance_datenschutz",
"bdsg": "bp_compliance_gesetze",
"ttdsg": "bp_compliance_gesetze",
"nist_": "bp_compliance_ce",
"owasp": "bp_compliance_ce",
"bsi_": "bp_compliance_ce",
"enisa": "bp_compliance_ce",
"at_": "bp_compliance_recht",
"fr_": "bp_compliance_recht",
"es_": "bp_compliance_recht",
}
result: dict[str, str] = {}
for ctrl in controls:
meta = ctrl.get("generation_metadata") or {}
reg = meta.get("source_regulation", "")
if not reg:
continue
for prefix, coll in collection_map.items():
if reg.startswith(prefix):
result[reg] = coll
break
else:
# Unknown regulation — search all
for coll in ALL_COLLECTIONS:
result[f"_all_{coll}"] = coll
return result
async def _match_control(self, ctrl: dict) -> Optional[MatchResult]:
    """Resolve article/paragraph for a control, trying hash, then regex, then LLM.

    Returns:
        A ``MatchResult`` tagged with the method that succeeded, or ``None``
        when no tier produced a match.
    """
    original_text = ctrl.get("source_original_text")

    # Tier 1: exact text hash looked up in the RAG index.
    if original_text:
        digest = hashlib.sha256(original_text.encode()).hexdigest()
        indexed = self._rag_index.get(digest)
        if indexed and (indexed.article or indexed.paragraph):
            return MatchResult(
                article=indexed.article or "",
                paragraph=indexed.paragraph or "",
                method="hash",
            )

    # Tier 2: split a concatenated "<name> <article>" source string.
    source_name = (ctrl.get("source_citation") or {}).get("source", "")
    split_source = _parse_concatenated_source(source_name)
    if split_source and split_source["article"]:
        # The concatenated format carries no paragraph information.
        return MatchResult(
            article=split_source["article"],
            paragraph="",
            method="regex",
        )

    # Tier 3: ask the LLM, which needs the original text to work with.
    if not original_text:
        return None
    return await self._llm_match(ctrl)
async def _llm_match(self, ctrl: dict) -> Optional[MatchResult]:
    """Ask the Ollama model for the article/paragraph of a control's source text.

    Args:
        ctrl: Control dict; reads ``source_citation``, ``generation_metadata``,
            ``source_original_text`` and ``control_id``.

    Returns:
        ``MatchResult(method="llm")`` when the model names an article and/or a
        paragraph, otherwise ``None``. Failures are logged and reported as
        ``None`` (best effort).
    """
    citation = ctrl.get("source_citation") or {}
    regulation_name = citation.get("source", "")
    metadata = ctrl.get("generation_metadata") or {}
    regulation_code = metadata.get("source_regulation", "")
    # `or ""` also covers a key that is present but None, which would
    # otherwise fail on the slice below (outside the try block).
    source_text = ctrl.get("source_original_text") or ""
    prompt = f"""Analysiere den folgenden Gesetzestext und bestimme den genauen Artikel und Absatz.
Gesetz: {regulation_name} (Code: {regulation_code})
Text:
---
{source_text[:2000]}
---
Antworte NUR mit JSON:
{{"article": "Art. XX", "paragraph": "Abs. Y"}}
Falls kein spezifischer Absatz erkennbar ist, setze paragraph auf "".
Falls kein Artikel erkennbar ist, setze article auf "".
Bei deutschen Gesetzen mit § verwende: "§ XX" statt "Art. XX"."""

    def _as_text(value) -> str:
        # Normalize an LLM-supplied field: JSON null and structured values
        # become "", numbers are stringified, strings are trimmed.
        if isinstance(value, str):
            return value.strip()
        if isinstance(value, (int, float)) and not isinstance(value, bool):
            return str(value)
        return ""

    try:
        raw = await _llm_ollama(prompt, BACKFILL_SYSTEM_PROMPT)
        data = _parse_json(raw)
        if isinstance(data, dict):
            article = _as_text(data.get("article"))
            paragraph = _as_text(data.get("paragraph"))
            if article or paragraph:
                return MatchResult(
                    article=article,
                    paragraph=paragraph,
                    method="llm",
                )
    except Exception as e:
        logger.warning("LLM match failed for %s: %s", ctrl.get("control_id"), e)
    return None
def _update_control(self, ctrl: dict, match: MatchResult):
    """Write the matched article/paragraph back to the control's DB row.

    Updates ``source_citation`` and ``generation_metadata`` of the
    ``canonical_controls`` row identified by ``ctrl["id"]``. The statement is
    executed on ``self.db``; no commit is issued here.

    Args:
        ctrl: Control dict as loaded for backfill; its ``source_citation`` and
            ``generation_metadata`` dicts are modified in place when present.
        match: Article/paragraph plus the method that produced them.
    """
    citation = ctrl.get("source_citation") or {}
    # Clean the source name: remove concatenated article if present
    source_str = citation.get("source", "")
    parsed = _parse_concatenated_source(source_str)
    if parsed:
        citation["source"] = parsed["name"]
    # Add separate article/paragraph fields
    citation["article"] = match.article
    citation["paragraph"] = match.paragraph
    # Update generation_metadata
    metadata = ctrl.get("generation_metadata") or {}
    if match.article:
        metadata["source_article"] = match.article
        metadata["source_paragraph"] = match.paragraph
    # Record how and when (UTC, ISO 8601) the backfill happened.
    metadata["backfill_method"] = match.method
    metadata["backfill_at"] = datetime.now(timezone.utc).isoformat()
    self.db.execute(
        text("""
UPDATE canonical_controls
SET source_citation = :citation,
generation_metadata = :metadata,
updated_at = NOW()
WHERE id = CAST(:id AS uuid)
"""),
        {
            "id": ctrl["id"],
            "citation": json.dumps(citation),
            "metadata": json.dumps(metadata),
        },
    )
def _parse_concatenated_source(source: str) -> Optional[dict]:
"""Parse 'DSGVO Art. 35'{name: 'DSGVO', article: 'Art. 35'}.
Also handles '§' format: 'BDSG § 42' {name: 'BDSG', article: '§ 42'}.
"""
if not source:
return None
# Try Art./Artikel pattern
m = _SOURCE_ARTICLE_RE.match(source)
if m:
return {"name": m.group(1).strip(), "article": m.group(2).strip()}
# Try § pattern
m2 = re.match(r"^(.+?)\s+(§\s*\d+.*)$", source)
if m2:
return {"name": m2.group(1).strip(), "article": m2.group(2).strip()}
return None
async def _llm_ollama(prompt: str, system_prompt: Optional[str] = None) -> str:
    """Send a non-streaming chat request to Ollama and return the reply text.

    Args:
        prompt: User message.
        system_prompt: Optional system message placed before the user message.

    Returns:
        The content of the response message, or ``""`` on a non-200 status or
        any request error (both are logged).
    """
    conversation = [{"role": "user", "content": prompt}]
    if system_prompt:
        conversation.insert(0, {"role": "system", "content": system_prompt})
    request_body = {
        "model": OLLAMA_MODEL,
        "messages": conversation,
        "stream": False,
        "format": "json",
        "options": {"num_predict": 256},
        "think": False,
    }
    try:
        async with httpx.AsyncClient(timeout=LLM_TIMEOUT) as http:
            response = await http.post(f"{OLLAMA_URL}/api/chat", json=request_body)
            if response.status_code != 200:
                logger.error("Ollama backfill failed %d: %s", response.status_code, response.text[:300])
                return ""
            body = response.json()
            message = body.get("message", {})
            if not isinstance(message, dict):
                # Fall back to a top-level "response" field for non-dict messages.
                return body.get("response", str(message))
            return message.get("content", "")
    except Exception as e:
        logger.error("Ollama backfill request failed: %s", e)
        return ""
def _parse_json(raw: str) -> Optional[dict]:
"""Extract JSON object from LLM output."""
if not raw:
return None
# Try direct parse
try:
return json.loads(raw)
except json.JSONDecodeError:
pass
# Try extracting from markdown code block
m = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", raw, re.DOTALL)
if m:
try:
return json.loads(m.group(1))
except json.JSONDecodeError:
pass
# Try finding first { ... }
m = re.search(r"\{[^{}]*\}", raw)
if m:
try:
return json.loads(m.group(0))
except json.JSONDecodeError:
pass
return None
@@ -0,0 +1,546 @@
"""Control Composer — Pattern + Obligation → Master Control.
Takes an obligation (from ObligationExtractor) and a matched control pattern
(from PatternMatcher), then uses LLM to compose a structured, actionable
Master Control. Replaces the old Stage 3 (STRUCTURE/REFORM) with a
pattern-guided approach.
Three composition modes based on license rules:
Rule 1: Obligation + Pattern + original text → full control
Rule 2: Obligation + Pattern + original text + citation → control
Rule 3: Obligation + Pattern (NO original text) → reformulated control
Fallback: No pattern match → basic generation (tagged needs_pattern_assignment)
Part of the Multi-Layer Control Architecture (Phase 6 of 8).
"""
import json
import logging
import os
from dataclasses import dataclass, field
from typing import Optional
from compliance.services.obligation_extractor import (
ObligationMatch,
_llm_ollama,
_parse_json,
)
from compliance.services.pattern_matcher import (
ControlPattern,
PatternMatchResult,
)
logger = logging.getLogger(__name__)
# Model name, read from the CONTROL_GEN_OLLAMA_MODEL environment variable
# (falls back to the default below when unset).
OLLAMA_MODEL = os.getenv("CONTROL_GEN_OLLAMA_MODEL", "qwen3.5:35b-a3b")
# Valid values for generated control fields; _validate_control replaces
# anything outside these sets with a default.
VALID_SEVERITIES = {"low", "medium", "high", "critical"}
VALID_EFFORTS = {"s", "m", "l", "xl"}
VALID_VERIFICATION = {"code_review", "document", "tool", "hybrid"}
@dataclass
class ComposedControl:
    """Master Control produced by combining an obligation with a control pattern.

    All fields have defaults, so an empty instance can be created and filled
    in step by step. ``to_dict`` exports every field under its own name.
    """

    # --- Core fields (mirror the canonical_controls schema) ---
    control_id: str = ""
    title: str = ""
    objective: str = ""
    rationale: str = ""
    scope: dict = field(default_factory=dict)
    requirements: list = field(default_factory=list)
    test_procedure: list = field(default_factory=list)
    evidence: list = field(default_factory=list)
    severity: str = "medium"
    risk_score: float = 5.0
    implementation_effort: str = "m"
    open_anchors: list = field(default_factory=list)
    release_state: str = "draft"
    tags: list = field(default_factory=list)

    # --- 3-rule license handling ---
    license_rule: Optional[int] = None
    source_original_text: Optional[str] = None
    source_citation: Optional[dict] = None
    customer_visible: bool = True

    # --- Classification ---
    verification_method: Optional[str] = None
    category: Optional[str] = None
    target_audience: Optional[list] = None

    # --- Links to the pattern and obligation(s) the control came from ---
    pattern_id: Optional[str] = None
    obligation_ids: list = field(default_factory=list)

    # --- Generation metadata ---
    generation_metadata: dict = field(default_factory=dict)
    composition_method: str = "pattern_guided"  # pattern_guided | fallback

    def to_dict(self) -> dict:
        """Return all fields as a plain dict for DB storage or an API response.

        Values are exported by reference (no copies are made).
        """
        exported = (
            "control_id",
            "title",
            "objective",
            "rationale",
            "scope",
            "requirements",
            "test_procedure",
            "evidence",
            "severity",
            "risk_score",
            "implementation_effort",
            "open_anchors",
            "release_state",
            "tags",
            "license_rule",
            "source_original_text",
            "source_citation",
            "customer_visible",
            "verification_method",
            "category",
            "target_audience",
            "pattern_id",
            "obligation_ids",
            "generation_metadata",
            "composition_method",
        )
        return {name: getattr(self, name) for name in exported}
class ControlComposer:
    """Composes Master Controls from obligations + patterns.

    Usage::

        composer = ControlComposer()
        control = await composer.compose(
            obligation=obligation_match,
            pattern_result=pattern_match_result,
            chunk_text="...",
            license_rule=1,
            source_citation={...},
        )
    """

    async def compose(
        self,
        obligation: ObligationMatch,
        pattern_result: PatternMatchResult,
        chunk_text: Optional[str] = None,
        license_rule: int = 3,
        source_citation: Optional[dict] = None,
        regulation_code: Optional[str] = None,
    ) -> ComposedControl:
        """Compose a Master Control from obligation + pattern.

        Args:
            obligation: The extracted obligation (from ObligationExtractor).
            pattern_result: The matched pattern (from PatternMatcher).
            chunk_text: Original RAG chunk text (only used for Rules 1-2).
            license_rule: 1=free, 2=citation, 3=restricted.
            source_citation: Citation metadata for Rule 2.
            regulation_code: Source regulation code.

        Returns:
            ComposedControl ready for storage. Its ``composition_method`` is
            "pattern_guided", "template_only" (pattern matched but the LLM
            gave no usable answer) or "fallback" (no pattern).
        """
        pattern = pattern_result.pattern if pattern_result else None
        if pattern:
            control = await self._compose_with_pattern(
                obligation, pattern, chunk_text, license_rule, source_citation,
            )
        else:
            control = await self._compose_fallback(
                obligation, chunk_text, license_rule, source_citation,
            )

        # Set linkage fields
        control.pattern_id = pattern.id if pattern else None
        if obligation.obligation_id:
            control.obligation_ids = [obligation.obligation_id]

        # Set license fields
        control.license_rule = license_rule
        if license_rule in (1, 2) and chunk_text:
            control.source_original_text = chunk_text
        if license_rule == 2 and source_citation:
            control.source_citation = source_citation
        if license_rule == 3:
            # Restricted sources: hide from customers and drop any source trace.
            control.customer_visible = False
            control.source_original_text = None
            control.source_citation = None

        # Build metadata
        control.generation_metadata = {
            "composition_method": control.composition_method,
            "pattern_id": control.pattern_id,
            "pattern_confidence": round(pattern_result.confidence, 3) if pattern_result else 0,
            "pattern_method": pattern_result.method if pattern_result else "none",
            "obligation_id": obligation.obligation_id,
            "obligation_method": obligation.method,
            "obligation_confidence": round(obligation.confidence, 3),
            "license_rule": license_rule,
            "regulation_code": regulation_code,
        }

        # Validate and fix fields
        _validate_control(control)
        return control

    async def compose_batch(
        self,
        items: list[dict],
    ) -> list[ComposedControl]:
        """Compose multiple controls, one after another.

        Args:
            items: List of dicts with keys: obligation, pattern_result,
                chunk_text, license_rule, source_citation, regulation_code.
                Only ``obligation`` is required.

        Returns:
            List of ComposedControl instances, in input order.
        """
        results = []
        for item in items:
            control = await self.compose(
                obligation=item["obligation"],
                pattern_result=item.get("pattern_result", PatternMatchResult()),
                chunk_text=item.get("chunk_text"),
                license_rule=item.get("license_rule", 3),
                source_citation=item.get("source_citation"),
                regulation_code=item.get("regulation_code"),
            )
            results.append(control)
        return results

    # -----------------------------------------------------------------------
    # Shared helpers
    # -----------------------------------------------------------------------

    @staticmethod
    def _title_from(parsed: dict, default: str) -> str:
        """Return the LLM title if it is a non-blank string, else *default*.

        Guards against ``null`` or non-string titles in the LLM JSON, which
        cannot be sliced. The result is capped at 255 characters.
        """
        title = parsed.get("title")
        if not isinstance(title, str) or not title.strip():
            title = default
        return title[:255]

    # -----------------------------------------------------------------------
    # Pattern-guided composition
    # -----------------------------------------------------------------------

    async def _compose_with_pattern(
        self,
        obligation: ObligationMatch,
        pattern: ControlPattern,
        chunk_text: Optional[str],
        license_rule: int,
        source_citation: Optional[dict],
    ) -> ComposedControl:
        """Use LLM to fill the pattern template with obligation-specific details.

        Falls back to ``_compose_from_template`` when the LLM returns nothing
        or its output does not contain a non-empty JSON object.
        """
        prompt = _build_compose_prompt(obligation, pattern, chunk_text, license_rule)
        system_prompt = _compose_system_prompt(license_rule)
        llm_result = await _llm_ollama(prompt, system_prompt)
        parsed = _parse_json(llm_result) if llm_result else None
        # Anything but a non-empty JSON object (None, {}, a list, ...) is unusable.
        if not isinstance(parsed, dict) or not parsed:
            return self._compose_from_template(obligation, pattern)

        control = ComposedControl(
            title=self._title_from(parsed, pattern.name_de),
            objective=parsed.get("objective", pattern.objective_template),
            rationale=parsed.get("rationale", pattern.rationale_template),
            requirements=_ensure_list(parsed.get("requirements", pattern.requirements_template)),
            test_procedure=_ensure_list(parsed.get("test_procedure", pattern.test_procedure_template)),
            evidence=_ensure_list(parsed.get("evidence", pattern.evidence_template)),
            severity=parsed.get("severity", pattern.severity_default),
            implementation_effort=parsed.get("implementation_effort", pattern.implementation_effort_default),
            category=parsed.get("category", pattern.category),
            tags=_ensure_list(parsed.get("tags", pattern.tags)),
            target_audience=_ensure_list(parsed.get("target_audience", [])),
            verification_method=parsed.get("verification_method"),
            open_anchors=_anchors_from_pattern(pattern),
            composition_method="pattern_guided",
        )
        return control

    def _compose_from_template(
        self,
        obligation: ObligationMatch,
        pattern: ControlPattern,
    ) -> ComposedControl:
        """Fallback: fill template directly without LLM (when LLM fails)."""
        obl_title = obligation.obligation_title or ""
        obl_text = obligation.obligation_text or ""
        title = f"{pattern.name_de}"
        if obl_title:
            title = f"{pattern.name_de}{obl_title}"
        objective = pattern.objective_template
        # Only append the obligation text when it is long enough to be informative.
        if obl_text and len(obl_text) > 20:
            objective = f"{pattern.objective_template} Bezug: {obl_text[:200]}"
        return ComposedControl(
            title=title[:255],
            objective=objective,
            rationale=pattern.rationale_template,
            requirements=list(pattern.requirements_template),
            test_procedure=list(pattern.test_procedure_template),
            evidence=list(pattern.evidence_template),
            severity=pattern.severity_default,
            implementation_effort=pattern.implementation_effort_default,
            category=pattern.category,
            tags=list(pattern.tags),
            open_anchors=_anchors_from_pattern(pattern),
            composition_method="template_only",
        )

    # -----------------------------------------------------------------------
    # Fallback (no pattern)
    # -----------------------------------------------------------------------

    async def _compose_fallback(
        self,
        obligation: ObligationMatch,
        chunk_text: Optional[str],
        license_rule: int,
        source_citation: Optional[dict],
    ) -> ComposedControl:
        """Generate a control without a pattern template (old-style).

        When the LLM output is missing or unparseable, the control is built
        from the obligation text and static defaults. The result is always
        marked ``release_state="needs_review"``.
        """
        prompt = _build_fallback_prompt(obligation, chunk_text, license_rule)
        system_prompt = _compose_system_prompt(license_rule)
        llm_result = await _llm_ollama(prompt, system_prompt)
        parsed = _parse_json(llm_result) if llm_result else None
        # _parse_json yields None on failure; treat that (and any non-object
        # JSON) as "no LLM fields" instead of failing on .get().
        if not isinstance(parsed, dict):
            parsed = {}

        obl_text = obligation.obligation_text or ""
        control = ComposedControl(
            title=self._title_from(parsed, obl_text[:100] if obl_text else "Untitled Control"),
            objective=parsed.get("objective", obl_text[:500]),
            rationale=parsed.get("rationale", "Aus gesetzlicher Pflicht abgeleitet."),
            requirements=_ensure_list(parsed.get("requirements", [])),
            test_procedure=_ensure_list(parsed.get("test_procedure", [])),
            evidence=_ensure_list(parsed.get("evidence", [])),
            severity=parsed.get("severity", "medium"),
            implementation_effort=parsed.get("implementation_effort", "m"),
            category=parsed.get("category"),
            tags=_ensure_list(parsed.get("tags", [])),
            target_audience=_ensure_list(parsed.get("target_audience", [])),
            verification_method=parsed.get("verification_method"),
            composition_method="fallback",
            release_state="needs_review",
        )
        return control
# ---------------------------------------------------------------------------
# Prompt builders
# ---------------------------------------------------------------------------
def _compose_system_prompt(license_rule: int) -> str:
"""Build the system prompt based on license rule."""
if license_rule == 3:
return (
"Du bist ein Security-Compliance-Experte. Deine Aufgabe ist es, "
"eigenstaendige Security Controls zu formulieren. "
"Du formulierst IMMER in eigenen Worten. "
"KOPIERE KEINE Saetze aus dem Quelltext. "
"Verwende eigene Begriffe und Struktur. "
"NENNE NICHT die Quelle. Keine proprietaeren Bezeichner. "
"Antworte NUR mit validem JSON."
)
return (
"Du bist ein Security-Compliance-Experte. "
"Erstelle ein praxisorientiertes, umsetzbares Security Control. "
"Antworte NUR mit validem JSON."
)
def _build_compose_prompt(
    obligation: ObligationMatch,
    pattern: ControlPattern,
    chunk_text: Optional[str],
    license_rule: int,
) -> str:
    """Assemble the user prompt for pattern-guided composition.

    The prompt contains the obligation section, the pattern section, a context
    section and the JSON answer schema. For license rule 3 the original text
    is never included; otherwise up to 2000 characters of *chunk_text* are
    embedded when available.
    """
    obligation_block = _obligation_section(obligation)
    pattern_block = _pattern_section(pattern)

    # Start from the "nothing available" wording and override where applicable.
    context_block = "KONTEXT: Kein Originaltext verfuegbar."
    if license_rule == 3:
        context_block = "KONTEXT: Intern analysiert (keine Quellenangabe)."
    elif chunk_text:
        context_block = f"KONTEXT (Originaltext):\n{chunk_text[:2000]}"

    return f"""Erstelle ein PRAXISORIENTIERTES Security Control.
{obligation_block}
{pattern_block}
{context_block}
AUFGABE:
Fuelle das Muster mit pflicht-spezifischen Details.
Das Ergebnis muss UMSETZBAR sein keine Gesetzesparaphrase.
Formuliere konkret und handlungsorientiert.
Antworte als JSON:
{{
"title": "Kurzer praegnanter Titel (max 100 Zeichen, deutsch)",
"objective": "Was soll erreicht werden? (1-3 Saetze)",
"rationale": "Warum ist das wichtig? (1-2 Saetze)",
"requirements": ["Konkrete Anforderung 1", "Anforderung 2", ...],
"test_procedure": ["Pruefschritt 1", "Pruefschritt 2", ...],
"evidence": ["Nachweis 1", "Nachweis 2", ...],
"severity": "low|medium|high|critical",
"implementation_effort": "s|m|l|xl",
"category": "{pattern.category}",
"tags": ["tag1", "tag2"],
"target_audience": ["unternehmen", "behoerden", "entwickler"],
"verification_method": "code_review|document|tool|hybrid"
}}"""
def _build_fallback_prompt(
    obligation: ObligationMatch,
    chunk_text: Optional[str],
    license_rule: int,
) -> str:
    """Assemble the user prompt for composition without a pattern.

    Consists of the obligation section, a context section and the JSON answer
    schema. For license rule 3 the original text is never included; otherwise
    up to 2000 characters of *chunk_text* are embedded when available.
    """
    obligation_block = _obligation_section(obligation)

    # Start from the "nothing available" wording and override where applicable.
    context_block = "KONTEXT: Kein Originaltext verfuegbar."
    if license_rule == 3:
        context_block = "KONTEXT: Intern analysiert (keine Quellenangabe)."
    elif chunk_text:
        context_block = f"KONTEXT (Originaltext):\n{chunk_text[:2000]}"

    return f"""Erstelle ein Security Control aus der folgenden Pflicht.
{obligation_block}
{context_block}
AUFGABE:
Formuliere ein umsetzbares Security Control.
Keine Gesetzesparaphrase konkrete Massnahmen beschreiben.
Antworte als JSON:
{{
"title": "Kurzer praegnanter Titel (max 100 Zeichen, deutsch)",
"objective": "Was soll erreicht werden? (1-3 Saetze)",
"rationale": "Warum ist das wichtig? (1-2 Saetze)",
"requirements": ["Konkrete Anforderung 1", "Anforderung 2", ...],
"test_procedure": ["Pruefschritt 1", "Pruefschritt 2", ...],
"evidence": ["Nachweis 1", "Nachweis 2", ...],
"severity": "low|medium|high|critical",
"implementation_effort": "s|m|l|xl",
"category": "one of: authentication, encryption, data_protection, etc.",
"tags": ["tag1", "tag2"],
"target_audience": ["unternehmen"],
"verification_method": "code_review|document|tool|hybrid"
}}"""
def _obligation_section(obligation: ObligationMatch) -> str:
    """Render the obligation as a newline-joined prompt section.

    Includes title, description (first 500 characters), ID and legal basis
    when present, and a placeholder line when neither title nor text exists.
    """
    title = obligation.obligation_title
    description = obligation.obligation_text

    lines = ["PFLICHT (was das Gesetz verlangt):"]
    if title:
        lines += [f" Titel: {title}"]
    if description:
        lines += [f" Beschreibung: {description[:500]}"]
    if obligation.obligation_id:
        lines += [f" ID: {obligation.obligation_id}"]
    if obligation.regulation_id:
        lines += [f" Rechtsgrundlage: {obligation.regulation_id}"]
    if not (description or title):
        lines += [" (Keine spezifische Pflicht extrahiert)"]
    return "\n".join(lines)
def _pattern_section(pattern: ControlPattern) -> str:
    """Render the pattern as a prompt section.

    Lists the pattern name/id, domain and objective template, followed by at
    most five requirement templates and three test-procedure templates as
    bullet lines.
    """

    def _bullets(entries, limit):
        # One "- item" line per entry, capped at `limit` entries.
        return "\n ".join(f"- {entry}" for entry in entries[:limit])

    requirement_lines = _bullets(pattern.requirements_template, 5)
    test_lines = _bullets(pattern.test_procedure_template, 3)
    return f"""MUSTER (wie man es typischerweise umsetzt):
Pattern: {pattern.name_de} ({pattern.id})
Domain: {pattern.domain}
Ziel-Template: {pattern.objective_template}
Anforderungs-Template:
{requirement_lines}
Pruefverfahren-Template:
{test_lines}"""
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _ensure_list(value) -> list:
"""Ensure a value is a list of strings."""
if isinstance(value, list):
return [str(v) for v in value if v]
if isinstance(value, str):
return [value]
return []
def _anchors_from_pattern(pattern: ControlPattern) -> list:
    """Translate the pattern's ``open_anchor_refs`` into control anchor dicts.

    Each anchor carries the reference's framework and ref (as ``control_id``),
    an empty title and a fixed alignment score of 0.8.
    """
    return [
        {
            "framework": ref.get("framework", ""),
            "control_id": ref.get("ref", ""),
            "title": "",
            "alignment_score": 0.8,
        }
        for ref in pattern.open_anchor_refs
    ]
def _validate_control(control: ComposedControl) -> None:
    """Normalize a composed control in place.

    Invalid enum-like values are replaced by defaults, an out-of-range risk
    score is re-derived from the severity, an over-long title is shortened,
    and empty content fields receive generic placeholder text.
    """
    # Enum-like fields: anything outside the allowed sets falls back.
    if control.severity not in VALID_SEVERITIES:
        control.severity = "medium"
    if control.implementation_effort not in VALID_EFFORTS:
        control.implementation_effort = "m"
    method = control.verification_method
    if method and method not in VALID_VERIFICATION:
        control.verification_method = None

    # Risk score must lie within [0, 10].
    if not (0 <= control.risk_score <= 10):
        control.risk_score = _severity_to_risk(control.severity)

    # Title is capped at 255 characters including the ellipsis.
    if len(control.title) > 255:
        control.title = control.title[:252] + "..."

    # Minimum content: the objective falls back to the title, the rest to
    # generic wording.
    if not control.objective:
        control.objective = control.title
    placeholders = (
        ("rationale", "Aus regulatorischer Anforderung abgeleitet."),
        ("requirements", ["Anforderung gemaess Pflichtbeschreibung umsetzen"]),
        ("test_procedure", ["Umsetzung der Anforderungen pruefen"]),
        ("evidence", ["Dokumentation der Umsetzung"]),
    )
    for attribute, placeholder in placeholders:
        if not getattr(control, attribute):
            setattr(control, attribute, placeholder)
def _severity_to_risk(severity: str) -> float:
"""Map severity to a default risk score."""
return {
"critical": 9.0,
"high": 7.0,
"medium": 5.0,
"low": 3.0,
}.get(severity, 5.0)
@@ -0,0 +1,745 @@
"""Control Deduplication Engine — 4-Stage Matching Pipeline.
Prevents duplicate atomic controls during Pass 0b by checking candidates
against existing controls before insertion.
Stages:
1. Pattern-Gate: pattern_id must match (hard gate)
2. Action-Check: normalized action verb must match (hard gate)
3. Object-Norm: normalized object must match (soft gate with high threshold)
4. Embedding: cosine similarity with tiered thresholds (Qdrant)
Verdicts:
- NEW: create a new atomic control
- LINK: add parent link to existing control (similarity > LINK_THRESHOLD)
- REVIEW: queue for human review (REVIEW_THRESHOLD < sim < LINK_THRESHOLD)
"""
import logging
import os
import re
from dataclasses import dataclass, field
from typing import Optional, Callable, Awaitable
import httpx
logger = logging.getLogger(__name__)
# ── Configuration ────────────────────────────────────────────────────
# All settings are read once from the environment at import time.
# Dedup is on unless DEDUP_ENABLED is set to something other than "true"
# (case-insensitive).
DEDUP_ENABLED = os.getenv("DEDUP_ENABLED", "true").lower() == "true"
# Similarity thresholds for the LINK and REVIEW verdicts described in the
# module docstring.
LINK_THRESHOLD = float(os.getenv("DEDUP_LINK_THRESHOLD", "0.92"))
REVIEW_THRESHOLD = float(os.getenv("DEDUP_REVIEW_THRESHOLD", "0.85"))
# NOTE(review): presumably stricter LINK thresholds for candidates whose
# normalized object differs / that come from another regulation — confirm
# against the code that uses them.
LINK_THRESHOLD_DIFF_OBJECT = float(os.getenv("DEDUP_LINK_THRESHOLD_DIFF_OBJ", "0.95"))
CROSS_REG_LINK_THRESHOLD = float(os.getenv("DEDUP_CROSS_REG_THRESHOLD", "0.95"))
# Qdrant collection name and service endpoints.
QDRANT_COLLECTION = os.getenv("DEDUP_QDRANT_COLLECTION", "atomic_controls")
QDRANT_URL = os.getenv("QDRANT_URL", "http://host.docker.internal:6333")
EMBEDDING_URL = os.getenv("EMBEDDING_URL", "http://embedding-service:8087")
# ── Result Dataclass ─────────────────────────────────────────────────
@dataclass
class DedupResult:
    """Outcome of the dedup check.

    ``verdict`` is the only required field; the ``matched_*`` fields default
    to ``None`` and describe the existing control a candidate was matched to.
    """
    verdict: str  # "new" | "link" | "review"
    matched_control_uuid: Optional[str] = None
    matched_control_id: Optional[str] = None
    matched_title: Optional[str] = None
    stage: str = ""  # which stage decided
    similarity_score: float = 0.0
    link_type: str = "dedup_merge"  # "dedup_merge" | "cross_regulation"
    details: dict = field(default_factory=dict)  # free-form extra information
# ── Action Normalization ─────────────────────────────────────────────
# Lookup table: lower-case action verb -> canonical English verb.
_ACTION_SYNONYMS: dict[str, str] = {
    # German → canonical English
    "implementieren": "implement",
    "umsetzen": "implement",
    "einrichten": "implement",
    "einführen": "implement",
    "aufbauen": "implement",
    "bereitstellen": "implement",
    "aktivieren": "implement",
    "konfigurieren": "configure",
    "einstellen": "configure",
    "parametrieren": "configure",
    "testen": "test",
    "prüfen": "test",
    "überprüfen": "test",
    "verifizieren": "test",
    "validieren": "test",
    "kontrollieren": "test",
    "auditieren": "audit",
    "dokumentieren": "document",
    "protokollieren": "log",
    "aufzeichnen": "log",
    "loggen": "log",
    "überwachen": "monitor",
    "monitoring": "monitor",
    "beobachten": "monitor",
    "schulen": "train",
    "trainieren": "train",
    "sensibilisieren": "train",
    "löschen": "delete",
    "entfernen": "delete",
    "verschlüsseln": "encrypt",
    "sperren": "block",
    "beschränken": "restrict",
    "einschränken": "restrict",
    "begrenzen": "restrict",
    "autorisieren": "authorize",
    "genehmigen": "authorize",
    "freigeben": "authorize",
    "authentifizieren": "authenticate",
    "identifizieren": "identify",
    "melden": "report",
    "benachrichtigen": "notify",
    "informieren": "notify",
    "aktualisieren": "update",
    "erneuern": "update",
    "sichern": "backup",
    "wiederherstellen": "restore",
    # English passthrough
    "implement": "implement",
    "configure": "configure",
    "test": "test",
    "verify": "test",
    "validate": "test",
    "audit": "audit",
    "document": "document",
    "log": "log",
    "monitor": "monitor",
    "train": "train",
    "delete": "delete",
    "encrypt": "encrypt",
    "restrict": "restrict",
    "authorize": "authorize",
    "authenticate": "authenticate",
    "report": "report",
    "update": "update",
    "backup": "backup",
    "restore": "restore",
}


def normalize_action(action: str) -> str:
    """Normalize an action verb to a canonical English form.

    Lookup order: exact match, match after stripping a German verb suffix,
    then a prefix match in either direction against the known verbs (first
    table entry wins).

    Args:
        action: Action verb; surrounding whitespace and case are ignored.

    Returns:
        The canonical verb, the cleaned input when nothing matches, or ``""``
        for empty or whitespace-only input.
    """
    if not action:
        return ""
    action = action.strip().lower()
    if not action:
        # Whitespace-only input: without this guard the prefix loop below
        # would match the first verb, because every string starts with "".
        return ""
    # Try exact match first, then base form
    if action in _ACTION_SYNONYMS:
        return _ACTION_SYNONYMS[action]
    # Strip German infinitive/conjugation suffixes for lookup
    action_base = re.sub(r"(en|t|st|e|te|tet|end)$", "", action)
    if action_base in _ACTION_SYNONYMS:
        return _ACTION_SYNONYMS[action_base]
    # Fuzzy: check if action starts with any known verb (or vice versa)
    for verb, canonical in _ACTION_SYNONYMS.items():
        if action.startswith(verb) or verb.startswith(action):
            return canonical
    return action  # fallback: return as-is
# ── Object Normalization ─────────────────────────────────────────────
_OBJECT_SYNONYMS: dict[str, str] = {
# Authentication / Access
"mfa": "multi_factor_auth",
"multi-faktor-authentifizierung": "multi_factor_auth",
"mehrfaktorauthentifizierung": "multi_factor_auth",
"multi-factor authentication": "multi_factor_auth",
"two-factor": "multi_factor_auth",
"2fa": "multi_factor_auth",
"passwort": "password_policy",
"kennwort": "password_policy",
"password": "password_policy",
"zugangsdaten": "credentials",
"credentials": "credentials",
"admin-konten": "privileged_access",
"admin accounts": "privileged_access",
"administratorkonten": "privileged_access",
"privilegierte zugriffe": "privileged_access",
"privileged accounts": "privileged_access",
"remote-zugriff": "remote_access",
"fernzugriff": "remote_access",
"remote access": "remote_access",
"session": "session_management",
"sitzung": "session_management",
"sitzungsverwaltung": "session_management",
# Encryption
"verschlüsselung": "encryption",
"encryption": "encryption",
"kryptografie": "encryption",
"kryptografische verfahren": "encryption",
"schlüssel": "key_management",
"key management": "key_management",
"schlüsselverwaltung": "key_management",
"zertifikat": "certificate_management",
"certificate": "certificate_management",
"tls": "transport_encryption",
"ssl": "transport_encryption",
"https": "transport_encryption",
# Network
"firewall": "firewall",
"netzwerk": "network_security",
"network": "network_security",
"vpn": "vpn",
"segmentierung": "network_segmentation",
"segmentation": "network_segmentation",
# Logging / Monitoring
"audit-log": "audit_logging",
"audit log": "audit_logging",
"protokoll": "audit_logging",
"logging": "audit_logging",
"monitoring": "monitoring",
"überwachung": "monitoring",
"alerting": "alerting",
"alarmierung": "alerting",
"siem": "siem",
# Data
"personenbezogene daten": "personal_data",
"personal data": "personal_data",
"sensible daten": "sensitive_data",
"sensitive data": "sensitive_data",
"datensicherung": "backup",
"backup": "backup",
"wiederherstellung": "disaster_recovery",
"disaster recovery": "disaster_recovery",
# Policy / Process
"richtlinie": "policy",
"policy": "policy",
"verfahrensanweisung": "procedure",
"procedure": "procedure",
"prozess": "process",
"schulung": "training",
"training": "training",
"awareness": "awareness",
"sensibilisierung": "awareness",
# Incident
"vorfall": "incident",
"incident": "incident",
"sicherheitsvorfall": "security_incident",
"security incident": "security_incident",
# Vulnerability
"schwachstelle": "vulnerability",
"vulnerability": "vulnerability",
"patch": "patch_management",
"update": "patch_management",
"patching": "patch_management",
}
# Precompile for substring matching (longest first)
_OBJECT_KEYS_SORTED = sorted(_OBJECT_SYNONYMS.keys(), key=len, reverse=True)
def normalize_object(obj: str) -> str:
    """Map a free-text compliance object onto a canonical token.

    Resolution order:
      1. exact lookup of the trimmed, lowercased text in ``_OBJECT_SYNONYMS``;
      2. the first synonym phrase contained in the text, where
         ``_OBJECT_KEYS_SORTED`` tries longer phrases before shorter ones;
      3. a fallback slug: articles/prepositions removed, tokens longer than
         two characters kept (at most four), joined with underscores.

    Args:
        obj: Raw object text (German or English).

    Returns:
        The canonical token, or ``""`` when ``obj`` is empty.
    """
    if not obj:
        return ""

    lowered = obj.strip().lower()

    # 1) Exact synonym hit.
    if lowered in _OBJECT_SYNONYMS:
        return _OBJECT_SYNONYMS[lowered]

    # 2) Longest contained synonym phrase wins.
    contained = next(
        (phrase for phrase in _OBJECT_KEYS_SORTED if phrase in lowered),
        None,
    )
    if contained is not None:
        return _OBJECT_SYNONYMS[contained]

    # 3) Fallback slug built from the remaining content words.
    without_stopwords = re.sub(
        r"\b(der|die|das|den|dem|des|ein|eine|eines|einem|einen"
        r"|für|von|zu|auf|in|an|bei|mit|nach|über|unter|the|a|an"
        r"|for|of|to|on|in|at|by|with)\b",
        "",
        lowered,
    )
    slug_tokens = [tok for tok in without_stopwords.split() if len(tok) > 2]
    if slug_tokens:
        return "_".join(slug_tokens[:4])
    # Nothing usable survived the filter: slugify the raw text instead.
    return lowered.replace(" ", "_")
# ── Canonicalization ─────────────────────────────────────────────────
def canonicalize_text(action: str, obj: str, title: str = "") -> str:
    """Build the canonical text that is sent to the embedding service.

    The result is the normalized action, the normalized object and, when a
    title is given, the word ``for`` followed by up to five title keywords,
    all joined by single spaces.

    Args:
        action: Raw action text, passed through ``normalize_action``.
        obj: Raw object text, passed through ``normalize_object``.
        title: Optional control title used as an extra keyword source.

    Returns:
        The space-joined canonical text.
    """
    pieces = [normalize_action(action), normalize_object(obj)]

    if title:
        # Drop common German filler words, then keep only words longer than
        # three characters (first five of them).
        without_filler = re.sub(
            r"\b(und|oder|für|von|zu|der|die|das|den|dem|des|ein|eine"
            r"|bei|mit|nach|gemäß|gem\.|laut|entsprechend)\b",
            "",
            title.lower(),
        )
        keywords = [word for word in without_filler.split() if len(word) > 3][:5]
        if keywords:
            pieces += ["for", *keywords]

    return " ".join(pieces)
# ── Embedding Helper ─────────────────────────────────────────────────
async def get_embedding(text: str) -> list[float]:
    """Fetch the embedding vector for one text from the embedding service.

    POSTs ``{"texts": [text]}`` to ``{EMBEDDING_URL}/embed`` and returns the
    first entry of the ``embeddings`` list in the JSON response.

    Returns:
        The embedding vector, or ``[]`` when the response carries no
        embeddings or any exception occurs (the error is logged as a warning).
    """
    request_body = {"texts": [text]}
    try:
        async with httpx.AsyncClient(timeout=10.0) as client:
            response = await client.post(
                f"{EMBEDDING_URL}/embed",
                json=request_body,
            )
            vectors = response.json().get("embeddings", [])
            if vectors:
                return vectors[0]
            return []
    except Exception as e:
        # Best effort: callers treat an empty vector as "embedding unavailable".
        logger.warning("Embedding failed: %s", e)
        return []
def cosine_similarity(a: list[float], b: list[float]) -> float:
    """Return the cosine similarity of two equally sized vectors.

    Returns ``0.0`` when either vector is empty, when the lengths differ,
    or when either vector has zero magnitude.
    """
    if not (a and b) or len(a) != len(b):
        return 0.0

    dot_product = sum(left * right for left, right in zip(a, b))
    magnitude_a = sum(component * component for component in a) ** 0.5
    magnitude_b = sum(component * component for component in b) ** 0.5

    # A zero vector has no direction; report "no similarity" instead of
    # dividing by zero.
    if not magnitude_a or not magnitude_b:
        return 0.0
    return dot_product / (magnitude_a * magnitude_b)
# ── Qdrant Helpers ───────────────────────────────────────────────────
async def qdrant_search(
    embedding: list[float],
    pattern_id: str,
    top_k: int = 10,
    collection: Optional[str] = None,
) -> list[dict]:
    """Search Qdrant for similar atomic controls with the same pattern_id.

    Args:
        embedding: Query vector; an empty vector short-circuits to ``[]``.
        pattern_id: Value the ``pattern_id`` payload field must match.
        top_k: Maximum number of hits requested.
        collection: Collection name; defaults to ``QDRANT_COLLECTION``.

    Returns:
        The ``result`` list of the Qdrant response, or ``[]`` on a non-200
        status or any exception (both are logged as warnings).
    """
    if not embedding:
        return []

    target = collection or QDRANT_COLLECTION
    pattern_filter = {
        "must": [
            {"key": "pattern_id", "match": {"value": pattern_id}}
        ]
    }
    request_body: dict = {
        "vector": embedding,
        "limit": top_k,
        "with_payload": True,
        "filter": pattern_filter,
    }

    try:
        async with httpx.AsyncClient(timeout=10.0) as client:
            response = await client.post(
                f"{QDRANT_URL}/collections/{target}/points/search",
                json=request_body,
            )
            if response.status_code == 200:
                return response.json().get("result", [])
            logger.warning("Qdrant search failed: %d", response.status_code)
            return []
    except Exception as e:
        logger.warning("Qdrant search error: %s", e)
        return []
async def qdrant_search_cross_regulation(
    embedding: list[float],
    top_k: int = 5,
    collection: Optional[str] = None,
) -> list[dict]:
    """Search Qdrant for similar controls without any pattern_id filter.

    Used for cross-regulation linking (e.g. relating DSGVO Art. 25 to
    NIS2 Art. 21), where the candidate and the match may come from
    different regulations.

    Args:
        embedding: Query vector; an empty vector short-circuits to ``[]``.
        top_k: Maximum number of hits requested.
        collection: Collection name; defaults to ``QDRANT_COLLECTION``.

    Returns:
        The ``result`` list of the Qdrant response, or ``[]`` on a non-200
        status or any exception (both are logged as warnings).
    """
    if not embedding:
        return []

    target = collection or QDRANT_COLLECTION
    request_body: dict = {
        "vector": embedding,
        "limit": top_k,
        "with_payload": True,
    }

    try:
        async with httpx.AsyncClient(timeout=10.0) as client:
            response = await client.post(
                f"{QDRANT_URL}/collections/{target}/points/search",
                json=request_body,
            )
            if response.status_code == 200:
                return response.json().get("result", [])
            logger.warning("Qdrant cross-reg search failed: %d", response.status_code)
            return []
    except Exception as e:
        logger.warning("Qdrant cross-reg search error: %s", e)
        return []
async def qdrant_upsert(
    point_id: str,
    embedding: list[float],
    payload: dict,
    collection: Optional[str] = None,
) -> bool:
    """Upsert a single point into a Qdrant collection.

    Args:
        point_id: ID of the point to write.
        embedding: Vector to store; an empty vector short-circuits to False.
        payload: Payload stored alongside the vector.
        collection: Collection name; defaults to ``QDRANT_COLLECTION``.

    Returns:
        True only when Qdrant answers with HTTP 200; False for an empty
        embedding, any other status, or any exception (logged as a warning).
    """
    if not embedding:
        return False

    target = collection or QDRANT_COLLECTION
    point = {
        "id": point_id,
        "vector": embedding,
        "payload": payload,
    }

    try:
        async with httpx.AsyncClient(timeout=10.0) as client:
            response = await client.put(
                f"{QDRANT_URL}/collections/{target}/points",
                json={"points": [point]},
            )
            return response.status_code == 200
    except Exception as e:
        logger.warning("Qdrant upsert error: %s", e)
        return False
async def ensure_qdrant_collection(
    vector_size: int = 1024,
    collection: Optional[str] = None,
) -> bool:
    """Make sure the Qdrant collection exists, creating it when missing.

    When the collection has to be created it is configured with cosine
    distance and keyword payload indexes on ``pattern_id``,
    ``action_normalized``, ``object_normalized`` and ``control_id``.
    The responses of the index requests are not inspected.

    Args:
        vector_size: Vector dimension used when creating the collection.
        collection: Collection name; defaults to ``QDRANT_COLLECTION``.

    Returns:
        True when the collection already exists or was created; False when
        creation fails or any exception occurs.
    """
    target = collection or QDRANT_COLLECTION
    indexed_fields = ["pattern_id", "action_normalized", "object_normalized", "control_id"]

    try:
        async with httpx.AsyncClient(timeout=10.0) as client:
            # Already there? Nothing to do.
            lookup = await client.get(f"{QDRANT_URL}/collections/{target}")
            if lookup.status_code == 200:
                return True

            creation = await client.put(
                f"{QDRANT_URL}/collections/{target}",
                json={
                    "vectors": {"size": vector_size, "distance": "Cosine"},
                },
            )
            if creation.status_code != 200:
                logger.error("Failed to create Qdrant collection: %d", creation.status_code)
                return False

            logger.info("Created Qdrant collection: %s", target)
            for field_name in indexed_fields:
                await client.put(
                    f"{QDRANT_URL}/collections/{target}/index",
                    json={"field_name": field_name, "field_schema": "keyword"},
                )
            return True
    except Exception as e:
        logger.warning("Qdrant collection check error: %s", e)
        return False
# ── Main Dedup Checker ───────────────────────────────────────────────
class ControlDedupChecker:
    """4-stage dedup checker for atomic controls.

    Usage:
        checker = ControlDedupChecker(db_session)
        result = await checker.check_duplicate(
            candidate_action, candidate_object, candidate_title, pattern_id,
        )
        if result.verdict == "link":
            checker.add_parent_link(result.matched_control_uuid, parent_uuid)
        elif result.verdict == "review":
            checker.write_review(control_id, title, objective, result)
        else:
            # Insert new control
    """

    def __init__(
        self,
        db,
        embed_fn: Optional[Callable[[str], Awaitable[list[float]]]] = None,
        search_fn: Optional[Callable] = None,
    ):
        """Create a checker bound to a DB session.

        Args:
            db: Session object exposing ``execute`` and ``commit``.
            embed_fn: Async callable turning text into a vector; defaults to
                ``get_embedding``.
            search_fn: Async callable with the ``qdrant_search`` signature;
                defaults to ``qdrant_search``.
        """
        self.db = db
        self._embed = embed_fn or get_embedding
        self._search = search_fn or qdrant_search
        # pattern_id → existing controls. Filled lazily by _load_existing and
        # never invalidated for the lifetime of this checker instance.
        self._cache: dict[str, list[dict]] = {}

    def _load_existing(self, pattern_id: str) -> list[dict]:
        """Load non-deprecated atomic controls with the given pattern_id.

        Results are memoised per pattern_id in ``self._cache``.
        """
        if pattern_id in self._cache:
            return self._cache[pattern_id]

        from sqlalchemy import text

        rows = self.db.execute(text("""
            SELECT id::text, control_id, title, objective,
                   pattern_id,
                   generation_metadata->>'obligation_type' as obligation_type
            FROM canonical_controls
            WHERE parent_control_uuid IS NOT NULL
              AND release_state != 'deprecated'
              AND pattern_id = :pid
        """), {"pid": pattern_id}).fetchall()

        result = [
            {
                "uuid": r[0], "control_id": r[1], "title": r[2],
                "objective": r[3], "pattern_id": r[4],
                "obligation_type": r[5],
            }
            for r in rows
        ]
        self._cache[pattern_id] = result
        return result

    async def check_duplicate(
        self,
        action: str,
        obj: str,
        title: str,
        pattern_id: Optional[str],
    ) -> DedupResult:
        """Run the 4-stage dedup pipeline plus cross-regulation linking.

        Args:
            action: Raw action text of the candidate control.
            obj: Raw object text of the candidate control.
            title: Candidate title (adds keywords to the canonical text).
            pattern_id: Pattern of the candidate; without it no dedup is done.

        Returns:
            DedupResult with verdict ``new``, ``link`` or ``review``.
        """
        # No pattern_id → can't dedup meaningfully
        if not pattern_id:
            return DedupResult(verdict="new", stage="no_pattern")

        # Stage 1: Pattern-Gate
        existing = self._load_existing(pattern_id)
        if not existing:
            return DedupResult(
                verdict="new", stage="pattern_gate",
                details={"reason": "no existing controls with this pattern_id"},
            )

        # Stage 2: Action-Check
        # The DB rows do not carry a normalized action, so the comparison is
        # made further down against the payload of the best Qdrant hit.
        norm_action = normalize_action(action)

        # Stage 3: Object-Normalization
        norm_object = normalize_object(obj)

        # Stage 4: Embedding Similarity
        canonical = canonicalize_text(action, obj, title)
        embedding = await self._embed(canonical)
        if not embedding:
            # Can't compute embedding → default to new
            return DedupResult(
                verdict="new", stage="embedding_unavailable",
                details={"canonical_text": canonical},
            )

        # Search Qdrant
        results = await self._search(embedding, pattern_id, top_k=5)
        if not results:
            # No intra-pattern matches → try cross-regulation
            return await self._check_cross_regulation(embedding, DedupResult(
                verdict="new", stage="no_qdrant_matches",
                details={"canonical_text": canonical, "action": norm_action, "object": norm_object},
            ))

        # Evaluate best match
        best = results[0]
        best_score = best.get("score", 0.0)
        # A hit may carry an explicit null payload; treat it like a missing one.
        best_payload = best.get("payload") or {}
        best_action = best_payload.get("action_normalized", "")
        best_object = best_payload.get("object_normalized", "")

        # Action differs → NEW (even if embedding is high)
        if best_action and norm_action and best_action != norm_action:
            return await self._check_cross_regulation(embedding, DedupResult(
                verdict="new", stage="action_mismatch",
                similarity_score=best_score,
                matched_control_id=best_payload.get("control_id"),
                details={
                    "candidate_action": norm_action,
                    "existing_action": best_action,
                    "similarity": best_score,
                },
            ))

        # Object differs → use higher threshold
        if best_object and norm_object and best_object != norm_object:
            if best_score > LINK_THRESHOLD_DIFF_OBJECT:
                return DedupResult(
                    verdict="link", stage="embedding_diff_object",
                    matched_control_uuid=best_payload.get("control_uuid"),
                    matched_control_id=best_payload.get("control_id"),
                    matched_title=best_payload.get("title"),
                    similarity_score=best_score,
                    details={"candidate_object": norm_object, "existing_object": best_object},
                )
            return await self._check_cross_regulation(embedding, DedupResult(
                verdict="new", stage="object_mismatch_below_threshold",
                similarity_score=best_score,
                matched_control_id=best_payload.get("control_id"),
                details={
                    "candidate_object": norm_object,
                    "existing_object": best_object,
                    "threshold": LINK_THRESHOLD_DIFF_OBJECT,
                },
            ))

        # Same action + same object → tiered thresholds
        if best_score > LINK_THRESHOLD:
            return DedupResult(
                verdict="link", stage="embedding_match",
                matched_control_uuid=best_payload.get("control_uuid"),
                matched_control_id=best_payload.get("control_id"),
                matched_title=best_payload.get("title"),
                similarity_score=best_score,
            )
        if best_score > REVIEW_THRESHOLD:
            return DedupResult(
                verdict="review", stage="embedding_review",
                matched_control_uuid=best_payload.get("control_uuid"),
                matched_control_id=best_payload.get("control_id"),
                matched_title=best_payload.get("title"),
                similarity_score=best_score,
            )
        return await self._check_cross_regulation(embedding, DedupResult(
            verdict="new", stage="embedding_below_threshold",
            similarity_score=best_score,
            details={"threshold": REVIEW_THRESHOLD},
        ))

    async def _check_cross_regulation(
        self,
        embedding: list[float],
        intra_result: DedupResult,
    ) -> DedupResult:
        """Second pass: cross-regulation linking for controls deemed 'new'.

        Searches Qdrant WITHOUT a pattern_id filter and links only when the
        best score exceeds ``CROSS_REG_LINK_THRESHOLD``. Any other outcome
        returns ``intra_result`` unchanged.
        """
        if intra_result.verdict != "new" or not embedding:
            return intra_result

        cross_results = await qdrant_search_cross_regulation(embedding, top_k=5)
        if not cross_results:
            return intra_result

        best = cross_results[0]
        best_score = best.get("score", 0.0)
        if best_score > CROSS_REG_LINK_THRESHOLD:
            # A hit may carry an explicit null payload; treat it like a missing one.
            best_payload = best.get("payload") or {}
            return DedupResult(
                verdict="link",
                stage="cross_regulation",
                matched_control_uuid=best_payload.get("control_uuid"),
                matched_control_id=best_payload.get("control_id"),
                matched_title=best_payload.get("title"),
                similarity_score=best_score,
                link_type="cross_regulation",
                details={
                    "cross_reg_score": best_score,
                    "cross_reg_threshold": CROSS_REG_LINK_THRESHOLD,
                },
            )
        return intra_result

    def add_parent_link(
        self,
        control_uuid: str,
        parent_control_uuid: str,
        link_type: str = "dedup_merge",
        confidence: float = 0.0,
        source_regulation: Optional[str] = None,
        source_article: Optional[str] = None,
        obligation_candidate_id: Optional[str] = None,
    ) -> None:
        """Add a parent link to an existing atomic control and commit.

        An existing (control_uuid, parent_control_uuid) pair is left
        untouched (``ON CONFLICT ... DO NOTHING``).
        """
        from sqlalchemy import text

        # CAST(:param AS type) is used instead of ``:param::type``: a bind
        # parameter immediately followed by a colon is not reliably recognised
        # by SQLAlchemy's text() parameter parsing.
        self.db.execute(text("""
            INSERT INTO control_parent_links
                (control_uuid, parent_control_uuid, link_type, confidence,
                 source_regulation, source_article, obligation_candidate_id)
            VALUES (:cu, :pu, :lt, :conf, :sr, :sa, CAST(:oci AS uuid))
            ON CONFLICT (control_uuid, parent_control_uuid) DO NOTHING
        """), {
            "cu": control_uuid,
            "pu": parent_control_uuid,
            "lt": link_type,
            "conf": confidence,
            "sr": source_regulation,
            "sa": source_article,
            "oci": obligation_candidate_id,
        })
        self.db.commit()

    def write_review(
        self,
        candidate_control_id: str,
        candidate_title: str,
        candidate_objective: str,
        result: DedupResult,
        parent_control_uuid: Optional[str] = None,
        obligation_candidate_id: Optional[str] = None,
    ) -> None:
        """Write a dedup review queue entry for a 'review' verdict and commit.

        ``result.details`` is serialised to JSON and stored as jsonb.
        """
        import json

        from sqlalchemy import text

        # CAST(:param AS type) instead of ``:param::type`` — see add_parent_link.
        self.db.execute(text("""
            INSERT INTO control_dedup_reviews
                (candidate_control_id, candidate_title, candidate_objective,
                 matched_control_uuid, matched_control_id,
                 similarity_score, dedup_stage, dedup_details,
                 parent_control_uuid, obligation_candidate_id)
            VALUES (:ccid, :ct, :co, CAST(:mcu AS uuid), :mci, :ss, :ds,
                    CAST(:dd AS jsonb), CAST(:pcu AS uuid), :oci)
        """), {
            "ccid": candidate_control_id,
            "ct": candidate_title,
            "co": candidate_objective,
            "mcu": result.matched_control_uuid,
            "mci": result.matched_control_id,
            "ss": result.similarity_score,
            "ds": result.stage,
            "dd": json.dumps(result.details),
            "pcu": parent_control_uuid,
            "oci": obligation_candidate_id,
        })
        self.db.commit()

    async def index_control(
        self,
        control_uuid: str,
        control_id: str,
        title: str,
        action: str,
        obj: str,
        pattern_id: str,
        collection: Optional[str] = None,
    ) -> bool:
        """Index a new atomic control in Qdrant for future dedup checks.

        Returns:
            False when no embedding could be computed, otherwise the result
            of ``qdrant_upsert``.
        """
        norm_action = normalize_action(action)
        norm_object = normalize_object(obj)
        canonical = canonicalize_text(action, obj, title)
        embedding = await self._embed(canonical)
        if not embedding:
            return False

        return await qdrant_upsert(
            point_id=control_uuid,
            embedding=embedding,
            payload={
                "control_uuid": control_uuid,
                "control_id": control_id,
                "title": title,
                "pattern_id": pattern_id,
                "action_normalized": norm_action,
                "object_normalized": norm_object,
                "canonical_text": canonical,
            },
            collection=collection,
        )
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,152 @@
"""
Control Status Transition State Machine.
Enforces that controls cannot be set to "pass" without sufficient evidence.
Prevents Compliance-Theater where controls claim compliance without real proof.
Transition rules:
planned in_progress : always allowed
in_progress pass : requires 1 evidence with confidence E2 and
truth_status in (uploaded, observed, validated_internal)
in_progress partial : requires 1 evidence (any level)
pass fail : always allowed (degradation)
any n/a : requires status_justification
any planned : always allowed (reset)
"""
from typing import List, Optional, Tuple
from ..db.models import EvidenceDB
# Confidence level ordering for comparisons
CONFIDENCE_ORDER = {"E0": 0, "E1": 1, "E2": 2, "E3": 3, "E4": 4}

# Truth statuses that qualify as "real" evidence for pass transitions
VALID_TRUTH_STATUSES = {"uploaded", "observed", "validated_internal", "accepted_by_auditor", "provided_to_auditor"}


def _has_qualifying_evidence(evidence_list) -> bool:
    """Return True if any evidence has confidence >= E2 and a valid truth status."""
    for e in evidence_list:
        conf = getattr(e, "confidence_level", None)
        truth = getattr(e, "truth_status", None)
        # Accept enum members (via .value) as well as plain strings; missing
        # values fall back to "E1" / "uploaded".
        conf_val = conf.value if hasattr(conf, "value") else str(conf) if conf else "E1"
        truth_val = truth.value if hasattr(truth, "value") else str(truth) if truth else "uploaded"
        if CONFIDENCE_ORDER.get(conf_val, 1) >= CONFIDENCE_ORDER["E2"] and truth_val in VALID_TRUTH_STATUSES:
            return True
    return False


def validate_transition(
    current_status: str,
    new_status: str,
    evidence_list: Optional[List["EvidenceDB"]] = None,
    status_justification: Optional[str] = None,
    bypass_for_auto_updater: bool = False,
) -> Tuple[bool, List[str]]:
    """
    Validate whether a control status transition is allowed.

    Args:
        current_status: Current control status value (e.g. "planned", "pass")
        new_status: Requested new status
        evidence_list: List of EvidenceDB objects linked to this control
        status_justification: Text justification (required for n/a transitions)
        bypass_for_auto_updater: If True, skip evidence checks (used by CI/CD auto-updater
            which creates evidence atomically with status change)

    Returns:
        Tuple of (allowed: bool, violations: list[str])
    """
    violations: List[str] = []
    evidence_list = evidence_list or []

    # Same status → no-op, and reset to planned: always allowed
    if current_status == new_status or new_status == "planned":
        return True, []

    # n/a requires justification
    if new_status == "n/a":
        if not status_justification or not status_justification.strip():
            violations.append("Transition to 'n/a' requires a status_justification explaining why this control is not applicable.")
        return len(violations) == 0, violations

    # → partial: needs at least 1 evidence (any level)
    if new_status == "partial":
        if not bypass_for_auto_updater and len(evidence_list) == 0:
            violations.append("Transition to 'partial' requires at least 1 evidence record.")
        return len(violations) == 0, violations

    # → pass: strict evidence requirements, whatever the current status is.
    # NOTE(review): the previous version contained later branches meant to
    # reject a direct planned/fail → pass transition and to special-case
    # partial → pass, but they were unreachable because this branch returns
    # for every transition to "pass". The effective behaviour is preserved
    # here; confirm whether the "must go through in_progress" rule should
    # actually be enforced.
    if new_status == "pass":
        if bypass_for_auto_updater:
            return True, []
        if len(evidence_list) == 0:
            violations.append("Transition to 'pass' requires at least 1 evidence record.")
            return False, violations
        if not _has_qualifying_evidence(evidence_list):
            violations.append(
                "Transition to 'pass' requires at least 1 evidence with confidence >= E2 "
                "and truth_status in (uploaded, observed, validated_internal, accepted_by_auditor). "
                "Current evidence does not meet this threshold."
            )
        return len(violations) == 0, violations

    # Everything else (pass → fail, planned → in_progress, any → fail, ...)
    # is allowed without further checks.
    return True, []
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,714 @@
"""Framework Decomposition Engine — decomposes framework-container obligations.
Sits between Pass 0a (obligation extraction) and Pass 0b (atomic control
composition). Detects obligations that reference a framework domain (e.g.
"CCM-Praktiken fuer AIS") and decomposes them into concrete sub-obligations
using an internal framework registry.
Three routing types:
atomic pass through to Pass 0b unchanged
compound split compound verbs, then Pass 0b
framework_container decompose via registry, then Pass 0b
The registry is a set of JSON files under compliance/data/frameworks/.
"""
import json
import logging
import os
import re
import uuid
from dataclasses import dataclass, field
from pathlib import Path
from typing import Optional
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Registry loading
# ---------------------------------------------------------------------------
_REGISTRY_DIR = Path(__file__).resolve().parent.parent / "data" / "frameworks"
_REGISTRY: dict[str, dict] = {} # framework_id → framework dict
def _load_registry() -> dict[str, dict]:
    """Read every ``*.json`` file in the registry directory.

    Files are processed in sorted order. Each framework is keyed by its
    ``framework_id`` field, falling back to the file stem. A file that fails
    to load is logged with its traceback and skipped.

    Returns:
        Mapping of framework_id to the parsed framework dict; empty when the
        registry directory does not exist.
    """
    loaded: dict[str, dict] = {}

    if not _REGISTRY_DIR.is_dir():
        logger.warning("Framework registry dir not found: %s", _REGISTRY_DIR)
        return loaded

    for json_path in sorted(_REGISTRY_DIR.glob("*.json")):
        try:
            framework = json.loads(json_path.read_text(encoding="utf-8"))
            framework_id = framework.get("framework_id", json_path.stem)
            loaded[framework_id] = framework
            logger.info(
                "Loaded framework: %s (%d domains)",
                framework_id,
                len(framework.get("domains", [])),
            )
        except Exception:
            logger.exception("Failed to load framework file: %s", json_path)

    return loaded
def get_registry() -> dict[str, dict]:
    """Return the module-wide framework registry, loading it on first use.

    The registry is (re)loaded whenever the cached mapping is empty, so an
    empty registry directory triggers a load attempt on every call.
    """
    global _REGISTRY
    if _REGISTRY:
        return _REGISTRY
    _REGISTRY = _load_registry()
    return _REGISTRY
def reload_registry() -> dict[str, dict]:
    """Discard the cached registry and read it from disk again.

    Returns:
        The freshly loaded registry, which also becomes the cached one.
    """
    global _REGISTRY
    refreshed = _load_registry()
    _REGISTRY = refreshed
    return refreshed
# ---------------------------------------------------------------------------
# Framework alias index (built from registry)
# ---------------------------------------------------------------------------
def _build_alias_index(registry: dict[str, dict]) -> dict[str, str]:
"""Build a lowercase alias → framework_id lookup."""
idx: dict[str, str] = {}
for fw_id, fw in registry.items():
# Framework-level aliases
idx[fw_id.lower()] = fw_id
name = fw.get("display_name", "")
if name:
idx[name.lower()] = fw_id
# Common short forms
for part in fw_id.lower().replace("_", " ").split():
if len(part) >= 3:
idx[part] = fw_id
return idx
# ---------------------------------------------------------------------------
# Routing — classify obligation type
# ---------------------------------------------------------------------------
# Extended patterns for framework detection (beyond the simple _COMPOSITE_RE
# in decomposition_pass.py — here we also capture the framework name)
_FRAMEWORK_PATTERN = re.compile(
r"(?:praktiken|kontrollen|ma(?:ss|ß)nahmen|anforderungen|vorgaben|controls|practices|measures|requirements)"
r"\s+(?:f(?:ue|ü)r|aus|gem(?:ae|ä)(?:ss|ß)|nach|from|of|for|per)\s+"
r"(.+?)(?:\s+(?:m(?:ue|ü)ssen|sollen|sind|werden|implementieren|umsetzen|einf(?:ue|ü)hren)|\.|,|$)",
re.IGNORECASE,
)
# Direct framework name references
_DIRECT_FRAMEWORK_RE = re.compile(
r"\b(?:CSA\s*CCM|NIST\s*(?:SP\s*)?800-53|OWASP\s*(?:ASVS|SAMM|Top\s*10)"
r"|CIS\s*Controls|BSI\s*(?:IT-)?Grundschutz|ENISA|ISO\s*2700[12]"
r"|COBIT|SOX|PCI\s*DSS|HITRUST|SOC\s*2|KRITIS)\b",
re.IGNORECASE,
)
# Compound verb patterns (multiple main verbs)
_COMPOUND_VERB_RE = re.compile(
r"\b(?:und|sowie|als\s+auch|or|and)\b",
re.IGNORECASE,
)
# No-split phrases that look compound but aren't
_NO_SPLIT_PHRASES = [
"pflegen und aufrechterhalten",
"dokumentieren und pflegen",
"definieren und dokumentieren",
"erstellen und freigeben",
"pruefen und genehmigen",
"identifizieren und bewerten",
"erkennen und melden",
"define and maintain",
"create and maintain",
"establish and maintain",
"monitor and review",
"detect and respond",
]
@dataclass
class RoutingResult:
    """Outcome of classifying an obligation for routing.

    Produced by ``classify_routing`` and ``_detect_framework``.
    """

    # One of: atomic | compound | framework_container | unknown_review
    routing_type: str
    # Registry ID of the detected framework, if one was resolved.
    framework_ref: Optional[str] = None
    # ID of the matched domain inside that framework, if any.
    framework_domain: Optional[str] = None
    # Title of the matched domain, if any.
    domain_title: Optional[str] = None
    # Confidence score assigned by the detection step that fired.
    confidence: float = 0.0
    # Short machine-readable tag naming the rule that produced this result.
    reason: str = ""
def classify_routing(
    obligation_text: str,
    action_raw: str,
    object_raw: str,
    condition_raw: Optional[str] = None,
) -> RoutingResult:
    """Classify an obligation into atomic / compound / framework_container.

    Args:
        obligation_text: Full obligation sentence.
        action_raw: Extracted action phrase, checked for compound verbs.
        object_raw: Extracted object phrase, checked for framework references.
        condition_raw: Accepted for interface compatibility; not used by the
            classification.

    Returns:
        The framework detection result when it reports a framework
        container; otherwise a ``compound`` result (confidence 0.7) when the
        action has multiple main verbs; otherwise ``atomic`` (confidence 0.9).
    """
    # --- Step 1: Framework container detection ---
    fw_result = _detect_framework(obligation_text, object_raw)
    if fw_result.routing_type == "framework_container":
        return fw_result

    # --- Step 2: Compound verb detection ---
    if _is_compound_obligation(action_raw, obligation_text):
        return RoutingResult(
            routing_type="compound",
            confidence=0.7,
            reason="multiple_main_verbs",
        )

    # --- Step 3: Default = atomic ---
    return RoutingResult(
        routing_type="atomic",
        confidence=0.9,
        reason="single_action_single_object",
    )
def _detect_framework(
    obligation_text: str, object_raw: str,
) -> RoutingResult:
    """Decide whether an obligation refers to a framework (domain).

    Three strategies are tried in order; the first one that fires wins:
      1. a well-known framework name appears verbatim in the text;
      2. a phrase such as "Praktiken fuer X" resolves to a registry domain;
      3. the object text alone contains at least two framework keywords.

    Returns:
        A ``framework_container`` result when a strategy fires, otherwise an
        ``atomic`` result with confidence 0.0.
    """
    haystack = f"{obligation_text} {object_raw}"
    registry = get_registry()
    alias_idx = _build_alias_index(registry)

    # Strategy 1: direct framework name match
    direct = _DIRECT_FRAMEWORK_RE.search(haystack)
    if direct:
        fw_name = direct.group(0).strip()
        fw_id = _resolve_framework_id(fw_name, alias_idx, registry)
        if not fw_id:
            # Framework name recognized but not in registry
            return RoutingResult(
                routing_type="framework_container",
                framework_ref=None,
                framework_domain=None,
                confidence=0.6,
                reason=f"direct_framework_match_no_registry:{fw_name}",
            )
        domain_id, domain_title = _match_domain(haystack, registry[fw_id])
        return RoutingResult(
            routing_type="framework_container",
            framework_ref=fw_id,
            framework_domain=domain_id,
            domain_title=domain_title,
            confidence=0.95 if domain_id else 0.75,
            reason=f"direct_framework_match:{fw_name}",
        )

    # Strategy 2: pattern match ("Praktiken fuer X")
    phrase = _FRAMEWORK_PATTERN.search(haystack)
    if phrase:
        ref_text = phrase.group(1).strip()
        fw_id, domain_id, domain_title = _resolve_from_ref_text(
            ref_text, registry, alias_idx,
        )
        if fw_id:
            return RoutingResult(
                routing_type="framework_container",
                framework_ref=fw_id,
                framework_domain=domain_id,
                domain_title=domain_title,
                confidence=0.85 if domain_id else 0.65,
                reason=f"pattern_match:{ref_text}",
            )

    # Strategy 3: keyword-heavy object
    if _has_framework_keywords(object_raw):
        return RoutingResult(
            routing_type="framework_container",
            framework_ref=None,
            framework_domain=None,
            confidence=0.5,
            reason="framework_keywords_in_object",
        )

    return RoutingResult(routing_type="atomic", confidence=0.0)
def _resolve_framework_id(
name: str,
alias_idx: dict[str, str],
registry: dict[str, dict],
) -> Optional[str]:
"""Resolve a framework name to its registry ID."""
normalized = re.sub(r"\s+", " ", name.strip().lower())
# Direct alias match
if normalized in alias_idx:
return alias_idx[normalized]
# Try compact form (strip spaces, hyphens, underscores)
compact = re.sub(r"[\s_\-]+", "", normalized)
for alias, fw_id in alias_idx.items():
if re.sub(r"[\s_\-]+", "", alias) == compact:
return fw_id
# Substring match in display names
for fw_id, fw in registry.items():
display = fw.get("display_name", "").lower()
if normalized in display or display in normalized:
return fw_id
# Partial match: check if normalized contains any alias (for multi-word refs)
for alias, fw_id in alias_idx.items():
if len(alias) >= 4 and alias in normalized:
return fw_id
return None
def _match_domain(
text: str, framework: dict,
) -> tuple[Optional[str], Optional[str]]:
"""Match a domain within a framework from text references."""
text_lower = text.lower()
best_id: Optional[str] = None
best_title: Optional[str] = None
best_score = 0
for domain in framework.get("domains", []):
score = 0
domain_id = domain["domain_id"]
title = domain.get("title", "")
# Exact domain ID match (e.g. "AIS")
if re.search(rf"\b{re.escape(domain_id)}\b", text, re.IGNORECASE):
score += 10
# Full title match
if title.lower() in text_lower:
score += 8
# Alias match
for alias in domain.get("aliases", []):
if alias.lower() in text_lower:
score += 6
break
# Keyword overlap
kw_hits = sum(
1 for kw in domain.get("keywords", [])
if kw.lower() in text_lower
)
score += kw_hits
if score > best_score:
best_score = score
best_id = domain_id
best_title = title
if best_score >= 3:
return best_id, best_title
return None, None
def _resolve_from_ref_text(
ref_text: str,
registry: dict[str, dict],
alias_idx: dict[str, str],
) -> tuple[Optional[str], Optional[str], Optional[str]]:
"""Resolve framework + domain from a reference text like 'AIS' or 'Application Security'."""
ref_lower = ref_text.lower()
for fw_id, fw in registry.items():
for domain in fw.get("domains", []):
# Check domain ID
if domain["domain_id"].lower() in ref_lower:
return fw_id, domain["domain_id"], domain.get("title")
# Check title
if domain.get("title", "").lower() in ref_lower:
return fw_id, domain["domain_id"], domain.get("title")
# Check aliases
for alias in domain.get("aliases", []):
if alias.lower() in ref_lower or ref_lower in alias.lower():
return fw_id, domain["domain_id"], domain.get("title")
return None, None, None
_FRAMEWORK_KW_SET = {
"praktiken", "kontrollen", "massnahmen", "maßnahmen",
"anforderungen", "vorgaben", "framework", "standard",
"baseline", "katalog", "domain", "family", "category",
"practices", "controls", "measures", "requirements",
}
def _has_framework_keywords(text: str) -> bool:
"""Check if text contains framework-indicator keywords."""
words = set(re.findall(r"[a-zäöüß]+", text.lower()))
return len(words & _FRAMEWORK_KW_SET) >= 2
def _is_compound_obligation(action_raw: str, obligation_text: str) -> bool:
    """Tell whether the action phrase joins two or more separate verbs.

    Only ``action_raw`` is inspected; ``obligation_text`` is not used.
    Phrases listed in ``_NO_SPLIT_PHRASES`` are treated as a single action
    even though they contain a conjunction.

    Returns:
        True when splitting the lowercased action on a conjunction yields at
        least two fragments of three or more characters.
    """
    if not action_raw:
        return False

    action = action_raw.lower().strip()

    # Fixed expressions that only look compound.
    if any(phrase in action for phrase in _NO_SPLIT_PHRASES):
        return False

    # Must have a conjunction
    if _COMPOUND_VERB_RE.search(action) is None:
        return False

    # Count the fragments around the conjunctions that are long enough to be
    # a verb of their own.
    fragments = re.split(r"\b(?:und|sowie|als\s+auch|or|and)\b", action)
    verb_like = sum(1 for fragment in fragments if len(fragment.strip()) >= 3)
    return verb_like >= 2
# ---------------------------------------------------------------------------
# Framework Decomposition
# ---------------------------------------------------------------------------
@dataclass
class DecomposedObligation:
    """One concrete obligation produced from a framework-container obligation."""

    # --- Identity / provenance ---
    obligation_candidate_id: str
    parent_control_id: str
    parent_framework_container_id: str
    source_ref_law: str
    source_ref_article: str

    # --- Obligation content ---
    obligation_text: str
    actor: str
    action_raw: str
    object_raw: str
    condition_raw: Optional[str] = None
    trigger_raw: Optional[str] = None

    # --- Pipeline state ---
    routing_type: str = "atomic"
    release_state: str = "decomposed"
    subcontrol_id: str = ""

    # --- Metadata ---
    action_hint: str = ""
    object_hint: str = ""
    object_class: str = ""
    # default_factory gives every instance its own list.
    keywords: list[str] = field(default_factory=list)
@dataclass
class FrameworkDecompositionResult:
    """Everything ``decompose_framework_container`` reports about one run."""

    # ID generated for this framework container.
    framework_container_id: str
    # ID of the obligation candidate that was decomposed.
    source_obligation_candidate_id: str
    # Resolved framework / domain; None when resolution failed.
    framework_ref: Optional[str]
    framework_domain: Optional[str]
    domain_title: Optional[str]
    # Subcontrols that were matched during decomposition.
    matched_subcontrols: list[str]
    decomposition_confidence: float
    # One of: decomposed | unmatched | error
    release_state: str
    # The concrete obligations produced by the decomposition.
    decomposed_obligations: list[DecomposedObligation]
    # Problems noted during decomposition (e.g. "ERROR: framework_not_matched").
    issues: list[str]
def decompose_framework_container(
    obligation_candidate_id: str,
    parent_control_id: str,
    obligation_text: str,
    framework_ref: Optional[str],
    framework_domain: Optional[str],
    actor: str = "organization",
) -> FrameworkDecompositionResult:
    """Decompose a framework-container obligation into concrete sub-obligations.

    Steps:
      1. Resolve framework from registry
      2. Resolve domain within framework
      3. Select relevant subcontrols (keyword filter or full domain)
      4. Generate decomposed obligations

    Args:
        obligation_candidate_id: Id of the container obligation; used as the
            prefix of every generated sub-obligation id.
        parent_control_id: Control the container obligation belongs to.
        obligation_text: Text of the container obligation.
        framework_ref: Registry key of the framework, if already known.
        framework_domain: Domain id within the framework, if already known.
        actor: Actor assigned to every generated obligation.

    Returns:
        A FrameworkDecompositionResult. ``release_state`` is "unmatched"
        (with an "ERROR: ..." issue) when no framework or no subcontrol could
        be resolved, otherwise "decomposed".

    The registry returned by ``get_registry()`` is treated as read-only:
    subcontrol dicts are copied before being tagged and keyword lists are
    copied into the results.
    """
    container_id = f"FWC-{uuid.uuid4().hex[:8]}"
    registry = get_registry()
    issues: list[str] = []

    def _unmatched():
        # Evaluated at call time, so it reports framework/domain as resolved
        # up to the point of failure.
        return FrameworkDecompositionResult(
            framework_container_id=container_id,
            source_obligation_candidate_id=obligation_candidate_id,
            framework_ref=framework_ref,
            framework_domain=framework_domain,
            domain_title=None,
            matched_subcontrols=[],
            decomposition_confidence=0.0,
            release_state="unmatched",
            decomposed_obligations=[],
            issues=issues,
        )

    # Step 1: Resolve framework
    fw = None
    if framework_ref and framework_ref in registry:
        fw = registry[framework_ref]
    else:
        # Try to find by name in text
        fw, framework_ref = _find_framework_in_text(obligation_text, registry)
    if not fw:
        issues.append("ERROR: framework_not_matched")
        return _unmatched()

    # Step 2: Resolve domain
    domains = fw.get("domains", [])
    domain_data = None
    domain_title = None
    if framework_domain:
        wanted = framework_domain.lower()
        for d in domains:
            if d["domain_id"].lower() == wanted:
                domain_data = d
                domain_title = d.get("title")
                break
    if not domain_data:
        # Try matching from text
        domain_id, domain_title = _match_domain(obligation_text, fw)
        if domain_id:
            for d in domains:
                if d["domain_id"] == domain_id:
                    domain_data = d
                    framework_domain = domain_id
                    break

    if not domain_data:
        issues.append("WARN: domain_not_matched — using all domains")
        # Fall back to all subcontrols across all domains.  Each subcontrol
        # is tagged with its domain on a shallow copy so the shared registry
        # entries are never modified.
        all_subcontrols = []
        for d in domains:
            for sc in d.get("subcontrols", []):
                all_subcontrols.append({**sc, "_domain_id": d["domain_id"]})
        subcontrols = _select_subcontrols(obligation_text, all_subcontrols)
        if not subcontrols:
            issues.append("ERROR: no_subcontrols_matched")
            return _unmatched()
    else:
        # Step 3: Select subcontrols from domain
        raw_subcontrols = domain_data.get("subcontrols", [])
        subcontrols = _select_subcontrols(obligation_text, raw_subcontrols)
        if not subcontrols:
            # Full domain decomposition
            subcontrols = raw_subcontrols

    # Quality check: too many subcontrols
    if len(subcontrols) > 25:
        issues.append(f"WARN: {len(subcontrols)} subcontrols — may be too broad")

    # Step 4: Generate decomposed obligations
    display_name = fw.get("display_name", framework_ref or "Unknown")
    decomposed: list[DecomposedObligation] = []
    matched_ids: list[str] = []
    for sc in subcontrols:
        sc_id = sc.get("subcontrol_id", "")
        matched_ids.append(sc_id)
        statement = sc.get("statement", "")
        action_hint = sc.get("action_hint", "")
        object_hint = sc.get("object_hint", "")
        # Quality warnings
        if not action_hint:
            issues.append(f"WARN: {sc_id} missing action_hint")
        if not object_hint:
            issues.append(f"WARN: {sc_id} missing object_hint")
        decomposed.append(DecomposedObligation(
            obligation_candidate_id=f"{obligation_candidate_id}-{sc_id}",
            parent_control_id=parent_control_id,
            parent_framework_container_id=container_id,
            source_ref_law=display_name,
            source_ref_article=sc_id,
            obligation_text=statement,
            actor=actor,
            # Hints win; otherwise fall back to heuristics on the statement.
            action_raw=action_hint or _infer_action(statement),
            object_raw=object_hint or _infer_object(statement),
            routing_type="atomic",
            release_state="decomposed",
            subcontrol_id=sc_id,
            action_hint=action_hint,
            object_hint=object_hint,
            object_class=sc.get("object_class", ""),
            # Copy so later edits to the result cannot leak into the registry.
            keywords=list(sc.get("keywords") or []),
        ))

    # Check if decomposed are identical to container
    container_text = obligation_text.strip()
    for d in decomposed:
        if d.obligation_text.strip() == container_text:
            issues.append(f"WARN: {d.subcontrol_id} identical to container text")

    confidence = _compute_decomposition_confidence(
        framework_ref, framework_domain, domain_data, len(subcontrols), issues,
    )
    return FrameworkDecompositionResult(
        framework_container_id=container_id,
        source_obligation_candidate_id=obligation_candidate_id,
        framework_ref=framework_ref,
        framework_domain=framework_domain,
        domain_title=domain_title,
        matched_subcontrols=matched_ids,
        decomposition_confidence=confidence,
        release_state="decomposed",
        decomposed_obligations=decomposed,
        issues=issues,
    )
def _find_framework_in_text(
    text: str, registry: dict[str, dict],
) -> tuple[Optional[dict], Optional[str]]:
    """Look for a known framework name in *text*.

    Returns:
        ``(framework_entry, framework_id)`` when the first direct framework
        mention resolves to a registry key, otherwise ``(None, None)``.
    """
    alias_index = _build_alias_index(registry)
    mention = _DIRECT_FRAMEWORK_RE.search(text)
    if mention is None:
        return None, None

    framework_id = _resolve_framework_id(mention.group(0), alias_index, registry)
    if not framework_id or framework_id not in registry:
        return None, None
    return registry[framework_id], framework_id
def _select_subcontrols(
obligation_text: str, subcontrols: list[dict],
) -> list[dict]:
"""Select relevant subcontrols based on keyword matching.
Returns empty list if no targeted match found (caller falls back to
full domain).
"""
text_lower = obligation_text.lower()
scored: list[tuple[int, dict]] = []
for sc in subcontrols:
score = 0
for kw in sc.get("keywords", []):
if kw.lower() in text_lower:
score += 1
# Title match
title = sc.get("title", "").lower()
if title and title in text_lower:
score += 3
# Object hint in text
obj = sc.get("object_hint", "").lower()
if obj and obj in text_lower:
score += 2
if score > 0:
scored.append((score, sc))
if not scored:
return []
# Only return those with meaningful overlap (score >= 2)
scored.sort(key=lambda x: x[0], reverse=True)
return [sc for score, sc in scored if score >= 2]
def _infer_action(statement: str) -> str:
"""Infer a basic action verb from a statement."""
s = statement.lower()
if any(w in s for w in ["definiert", "definieren", "define"]):
return "definieren"
if any(w in s for w in ["implementiert", "implementieren", "implement"]):
return "implementieren"
if any(w in s for w in ["dokumentiert", "dokumentieren", "document"]):
return "dokumentieren"
if any(w in s for w in ["ueberwacht", "ueberwachen", "monitor"]):
return "ueberwachen"
if any(w in s for w in ["getestet", "testen", "test"]):
return "testen"
if any(w in s for w in ["geschuetzt", "schuetzen", "protect"]):
return "implementieren"
if any(w in s for w in ["verwaltet", "verwalten", "manage"]):
return "pflegen"
if any(w in s for w in ["gemeldet", "melden", "report"]):
return "melden"
return "implementieren"
def _infer_object(statement: str) -> str:
"""Infer the primary object from a statement (first noun phrase)."""
# Simple heuristic: take the text after "muessen"/"muss" up to the verb
m = re.search(
r"(?:muessen|muss|m(?:ü|ue)ssen)\s+(.+?)(?:\s+werden|\s+sein|\.|,|$)",
statement,
re.IGNORECASE,
)
if m:
return m.group(1).strip()[:80]
# Fallback: first 80 chars
return statement[:80] if statement else ""
def _compute_decomposition_confidence(
framework_ref: Optional[str],
domain: Optional[str],
domain_data: Optional[dict],
num_subcontrols: int,
issues: list[str],
) -> float:
"""Compute confidence score for the decomposition."""
score = 0.3
if framework_ref:
score += 0.25
if domain:
score += 0.20
if domain_data:
score += 0.10
if 1 <= num_subcontrols <= 15:
score += 0.10
elif num_subcontrols > 15:
score += 0.05 # less confident with too many
# Penalize errors
errors = sum(1 for i in issues if i.startswith("ERROR:"))
score -= errors * 0.15
return round(max(min(score, 1.0), 0.0), 2)
# ---------------------------------------------------------------------------
# Registry statistics (for admin/debugging)
# ---------------------------------------------------------------------------
def registry_stats() -> dict:
    """Summarise the loaded registry (for admin/debugging).

    Returns:
        A dict with the number of frameworks, a per-framework ``details``
        list (id, display name, domain count, subcontrol count) and the
        overall ``total_domains`` / ``total_subcontrols``.
    """
    registry = get_registry()
    details = []
    domain_total = 0
    subcontrol_total = 0

    for framework_id, framework in registry.items():
        framework_domains = framework.get("domains", [])
        subcontrol_count = 0
        for domain in framework_domains:
            subcontrol_count += len(domain.get("subcontrols", []))

        domain_total += len(framework_domains)
        subcontrol_total += subcontrol_count
        details.append({
            "framework_id": framework_id,
            "display_name": framework.get("display_name", ""),
            "domains": len(framework_domains),
            "subcontrols": subcontrol_count,
        })

    return {
        "frameworks": len(registry),
        "details": details,
        "total_domains": domain_total,
        "total_subcontrols": subcontrol_total,
    }
@@ -173,6 +173,7 @@ class LLMProviderType(str, Enum):
"""Supported LLM provider types."""
ANTHROPIC = "anthropic"
SELF_HOSTED = "self_hosted"
OLLAMA = "ollama" # Alias for self_hosted (Ollama-specific)
MOCK = "mock" # For testing
@@ -392,6 +393,7 @@ class SelfHostedProvider(LLMProvider):
"model": self.model,
"prompt": full_prompt,
"stream": False,
"think": False, # Disable thinking mode (qwen3.5 etc.)
"options": {}
}
@@ -549,7 +551,7 @@ def get_llm_config() -> LLMConfig:
vault_path="breakpilot/api_keys/anthropic",
env_var="ANTHROPIC_API_KEY"
)
elif provider_type == LLMProviderType.SELF_HOSTED:
elif provider_type in (LLMProviderType.SELF_HOSTED, LLMProviderType.OLLAMA):
api_key = get_secret_from_vault_or_env(
vault_path="breakpilot/api_keys/self_hosted_llm",
env_var="SELF_HOSTED_LLM_KEY"
@@ -558,7 +560,7 @@ def get_llm_config() -> LLMConfig:
# Select model based on provider type
if provider_type == LLMProviderType.ANTHROPIC:
model = os.getenv("ANTHROPIC_MODEL", "claude-sonnet-4-20250514")
elif provider_type == LLMProviderType.SELF_HOSTED:
elif provider_type in (LLMProviderType.SELF_HOSTED, LLMProviderType.OLLAMA):
model = os.getenv("SELF_HOSTED_LLM_MODEL", "qwen2.5:14b")
else:
model = "mock-model"
@@ -591,7 +593,7 @@ def get_llm_provider(config: Optional[LLMConfig] = None) -> LLMProvider:
return MockProvider(config)
return AnthropicProvider(config)
elif config.provider_type == LLMProviderType.SELF_HOSTED:
elif config.provider_type in (LLMProviderType.SELF_HOSTED, LLMProviderType.OLLAMA):
if not config.base_url:
logger.warning("No self-hosted LLM URL found, using mock provider")
return MockProvider(config)
@@ -0,0 +1,59 @@
"""Shared normative language patterns for assertion classification.
Extracted from decomposition_pass.py for reuse in the assertion engine.
"""
import re
# Mandatory language ("must"): modal verbs, "ist/sind/hat/haben ... zu <verb>"
# constructions, English equivalents and separable verbs with infixed "zu".
_PFLICHT_SIGNALS = [
    r"\bmüssen\b",
    r"\bmuss\b",
    r"\bhat\s+sicherzustellen\b",
    r"\bhaben\s+sicherzustellen\b",
    r"\bsind\s+verpflichtet\b",
    r"\bist\s+verpflichtet\b",
    r"\bist\s+zu\s+\w+en\b",
    r"\bsind\s+zu\s+\w+en\b",
    r"\bhat\s+zu\s+\w+en\b",
    r"\bhaben\s+zu\s+\w+en\b",
    r"\bist\s+\w+zu\w+en\b",
    r"\bsind\s+\w+zu\w+en\b",
    r"\bist\s+\w+\s+zu\s+\w+en\b",
    r"\bsind\s+\w+\s+zu\s+\w+en\b",
    r"\bhat\s+\w+\s+zu\s+\w+en\b",
    r"\bhaben\s+\w+\s+zu\s+\w+en\b",
    r"\bshall\b",
    r"\bmust\b",
    r"\brequired\b",
    r"\b\w+zuteilen\b",
    r"\b\w+zuwenden\b",
    r"\b\w+zustellen\b",
    r"\b\w+zulegen\b",
    r"\b\w+zunehmen\b",
    r"\b\w+zuführen\b",
    r"\b\w+zuhalten\b",
    r"\b\w+zusetzen\b",
    r"\b\w+zuweisen\b",
    r"\b\w+zuordnen\b",
    r"\b\w+zufügen\b",
    r"\b\w+zugeben\b",
    r"\bist\b.{1,80}\bzu\s+\w+en\b",
    r"\bsind\b.{1,80}\bzu\s+\w+en\b",
]
PFLICHT_RE = re.compile("|".join(_PFLICHT_SIGNALS), flags=re.IGNORECASE)

# Recommendation language ("should") and bare duty verbs.
_EMPFEHLUNG_SIGNALS = [
    r"\bsoll\b",
    r"\bsollen\b",
    r"\bsollte\b",
    r"\bsollten\b",
    r"\bgewährleisten\b",
    r"\bsicherstellen\b",
    r"\bshould\b",
    r"\bensure\b",
    r"\brecommend\w*\b",
    r"\bnachweisen\b",
    r"\beinhalten\b",
    r"\bunterlassen\b",
    r"\bwahren\b",
    r"\bdokumentieren\b",
    r"\bimplementieren\b",
    r"\büberprüfen\b",
    r"\büberwachen\b",
    r"\bprüfen,\s+ob\b",
    r"\bkontrollieren,\s+ob\b",
]
EMPFEHLUNG_RE = re.compile("|".join(_EMPFEHLUNG_SIGNALS), flags=re.IGNORECASE)

# Permission language ("may").
_KANN_SIGNALS = [
    r"\bkann\b",
    r"\bkönnen\b",
    r"\bdarf\b",
    r"\bdürfen\b",
    r"\bmay\b",
    r"\boptional\b",
]
KANN_RE = re.compile("|".join(_KANN_SIGNALS), flags=re.IGNORECASE)

# Any normative signal: mandatory, recommended or permitted (in that order).
NORMATIVE_RE = re.compile(
    "|".join([*_PFLICHT_SIGNALS, *_EMPFEHLUNG_SIGNALS, *_KANN_SIGNALS]),
    flags=re.IGNORECASE,
)

# Rationale / justification markers.
_RATIONALE_SIGNALS = [
    r"\bda\s+",
    r"\bweil\b",
    r"\bgrund\b",
    r"\berwägung",
    r"\bbecause\b",
    r"\breason\b",
    r"\brationale\b",
    r"\bkönnen\s+.*\s+verursachen\b",
    r"\bführt\s+zu\b",
]
RATIONALE_RE = re.compile("|".join(_RATIONALE_SIGNALS), flags=re.IGNORECASE)

# Evidence-related keywords (for fact detection)
_EVIDENCE_KEYWORDS = [
    r"\bnachweis\b",
    r"\bzertifikat\b",
    r"\baudit.report\b",
    r"\bprotokoll\b",
    r"\bdokumentation\b",
    r"\bbericht\b",
    r"\bcertificate\b",
    r"\bevidence\b",
    r"\bproof\b",
]
EVIDENCE_RE = re.compile("|".join(_EVIDENCE_KEYWORDS), flags=re.IGNORECASE)
@@ -0,0 +1,563 @@
"""Obligation Extractor — 3-Tier Chunk-to-Obligation Linking.
Maps RAG chunks to obligations from the v2 obligation framework using
three tiers (fastest first):
Tier 1: EXACT MATCH regulation_code + article obligation_id (~40%)
Tier 2: EMBEDDING chunk text vs. obligation descriptions (~30%)
Tier 3: LLM EXTRACT local Ollama extracts obligation text (~25%)
Part of the Multi-Layer Control Architecture (Phase 4 of 8).
"""
import json
import logging
import os
import re
from dataclasses import dataclass, field
from pathlib import Path
from typing import Optional
import httpx
logger = logging.getLogger(__name__)
EMBEDDING_URL = os.getenv("EMBEDDING_URL", "http://embedding-service:8087")
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
OLLAMA_MODEL = os.getenv("CONTROL_GEN_OLLAMA_MODEL", "qwen3.5:35b-a3b")
LLM_TIMEOUT = float(os.getenv("CONTROL_GEN_LLM_TIMEOUT", "180"))
# Embedding similarity thresholds for Tier 2
EMBEDDING_MATCH_THRESHOLD = 0.80
EMBEDDING_CANDIDATE_THRESHOLD = 0.60
# ---------------------------------------------------------------------------
# Regulation code mapping: RAG chunk codes → obligation file regulation IDs
# ---------------------------------------------------------------------------
_REGULATION_CODE_TO_ID = {
# DSGVO
"eu_2016_679": "dsgvo",
"dsgvo": "dsgvo",
"gdpr": "dsgvo",
# AI Act
"eu_2024_1689": "ai_act",
"ai_act": "ai_act",
"aiact": "ai_act",
# NIS2
"eu_2022_2555": "nis2",
"nis2": "nis2",
"bsig": "nis2",
# BDSG
"bdsg": "bdsg",
# TTDSG
"ttdsg": "ttdsg",
# DSA
"eu_2022_2065": "dsa",
"dsa": "dsa",
# Data Act
"eu_2023_2854": "data_act",
"data_act": "data_act",
# EU Machinery
"eu_2023_1230": "eu_machinery",
"eu_machinery": "eu_machinery",
# DORA
"eu_2022_2554": "dora",
"dora": "dora",
}
@dataclass
class ObligationMatch:
    """Outcome of linking a chunk to an obligation."""

    obligation_id: Optional[str] = None
    obligation_title: Optional[str] = None
    obligation_text: Optional[str] = None
    method: str = "none"  # exact_match | embedding_match | llm_extracted | inferred
    confidence: float = 0.0
    regulation_id: Optional[str] = None  # e.g. "dsgvo"

    def to_dict(self) -> dict:
        """Return the match as a plain dict, keys in field order."""
        exported = (
            "obligation_id",
            "obligation_title",
            "obligation_text",
            "method",
            "confidence",
            "regulation_id",
        )
        return {name: getattr(self, name) for name in exported}
@dataclass
class _ObligationEntry:
"""Internal representation of a loaded obligation."""
id: str
title: str
description: str
regulation_id: str
articles: list[str] = field(default_factory=list) # normalized: ["art. 30", "§ 38"]
embedding: list[float] = field(default_factory=list)
class ObligationExtractor:
    """3-Tier obligation extraction from RAG chunks.

    Tiers are tried in order: exact article lookup, embedding similarity,
    then LLM extraction.

    Usage::

        extractor = ObligationExtractor()
        await extractor.initialize()  # loads obligations + embeddings
        match = await extractor.extract(
            chunk_text="...",
            regulation_code="eu_2016_679",
            article="Art. 30",
            paragraph="Abs. 1",
        )
    """

    def __init__(self):
        self._article_lookup: dict[str, list[str]] = {}  # "dsgvo/art. 30" → ["DSGVO-OBL-001"]
        self._obligations: dict[str, _ObligationEntry] = {}  # id → entry
        # Parallel lists: _obligation_embeddings[i] belongs to _obligation_ids[i].
        self._obligation_embeddings: list[list[float]] = []
        self._obligation_ids: list[str] = []
        self._initialized = False

    async def initialize(self) -> None:
        """Load all obligations from v2 JSON files and compute embeddings.

        Does nothing when the extractor is already initialized.
        """
        if self._initialized:
            return
        self._load_obligations()
        await self._compute_embeddings()
        self._initialized = True
        logger.info(
            "ObligationExtractor initialized: %d obligations, %d article lookups, %d embeddings",
            len(self._obligations),
            len(self._article_lookup),
            sum(1 for e in self._obligation_embeddings if e),
        )

    async def extract(
        self,
        chunk_text: str,
        regulation_code: str,
        article: Optional[str] = None,
        paragraph: Optional[str] = None,
    ) -> ObligationMatch:
        """Extract obligation from a chunk using 3-tier strategy.

        Args:
            chunk_text: Text of the RAG chunk.
            regulation_code: Regulation code attached to the chunk.
            article: Article reference, enables the Tier 1 lookup.
            paragraph: Accepted for interface completeness; not used.

        Returns:
            The first match produced by Tier 1, Tier 2 or Tier 3.  Tier 3
            always returns an ObligationMatch (confidence 0.0 on failure).
        """
        if not self._initialized:
            await self.initialize()

        reg_id = _normalize_regulation(regulation_code)

        # Tier 1: Exact match via article lookup
        if article:
            match = self._tier1_exact(reg_id, article)
            if match:
                return match

        # Tier 2: Embedding similarity
        match = await self._tier2_embedding(chunk_text, reg_id)
        if match:
            return match

        # Tier 3: LLM extraction
        return await self._tier3_llm(chunk_text, regulation_code, article)

    # -----------------------------------------------------------------------
    # Tier 1: Exact Match
    # -----------------------------------------------------------------------
    def _tier1_exact(self, reg_id: Optional[str], article: str) -> Optional[ObligationMatch]:
        """Look up obligation by regulation + article.

        Returns None when the regulation is unknown or no obligation is
        registered for the normalized article.
        """
        if not reg_id:
            return None
        norm_article = _normalize_article(article)
        obl_ids = self._article_lookup.get(f"{reg_id}/{norm_article}")
        if not obl_ids:
            return None
        # Take the first match (highest priority)
        entry = self._obligations.get(obl_ids[0])
        if not entry:
            return None
        return ObligationMatch(
            obligation_id=entry.id,
            obligation_title=entry.title,
            obligation_text=entry.description,
            method="exact_match",
            confidence=1.0,
            regulation_id=reg_id,
        )

    # -----------------------------------------------------------------------
    # Tier 2: Embedding Match
    # -----------------------------------------------------------------------
    async def _tier2_embedding(
        self, chunk_text: str, reg_id: Optional[str]
    ) -> Optional[ObligationMatch]:
        """Find nearest obligation by embedding similarity.

        Candidates from the chunk's own regulation get a +0.05 bonus when
        ranking; the acceptance threshold is applied to the raw similarity.
        """
        if not self._obligation_embeddings:
            return None
        chunk_embedding = await _get_embedding(chunk_text[:2000])
        if not chunk_embedding:
            return None

        best_id: Optional[str] = None
        best_ranked = 0.0
        best_raw = 0.0
        # zip() keeps ids and vectors paired even if the two lists ever
        # differ in length, instead of raising IndexError.
        for obl_id, obl_emb in zip(self._obligation_ids, self._obligation_embeddings):
            if not obl_emb:
                continue
            entry = self._obligations.get(obl_id)
            raw = _cosine_sim(chunk_embedding, obl_emb)
            ranked = raw
            # Domain bonus: +0.05 if same regulation
            if entry and reg_id and entry.regulation_id == reg_id:
                ranked += 0.05
            if ranked > best_ranked:
                best_ranked = ranked
                best_raw = raw
                best_id = obl_id

        if best_id is None or best_raw < EMBEDDING_MATCH_THRESHOLD:
            return None

        entry = self._obligations.get(best_id)
        return ObligationMatch(
            obligation_id=entry.id if entry else best_id,
            obligation_title=entry.title if entry else None,
            obligation_text=entry.description if entry else None,
            method="embedding_match",
            confidence=round(min(best_raw, 1.0), 3),
            regulation_id=entry.regulation_id if entry else reg_id,
        )

    # -----------------------------------------------------------------------
    # Tier 3: LLM Extraction
    # -----------------------------------------------------------------------
    async def _tier3_llm(
        self, chunk_text: str, regulation_code: str, article: Optional[str]
    ) -> ObligationMatch:
        """Use local LLM to extract the obligation from the chunk.

        Always returns an ObligationMatch with method "llm_extracted":
        confidence 0.0 when the LLM produced nothing, otherwise 0.60 with
        the extracted text (or the first 500 characters of the raw reply
        when the reply carries no "obligation_text" field).
        """
        prompt = f"""Analysiere den folgenden Gesetzestext und extrahiere die zentrale rechtliche Pflicht.
Text:
{chunk_text[:3000]}
Quelle: {regulation_code} {article or ''}
Antworte NUR als JSON:
{{
"obligation_text": "Die zentrale Pflicht in einem Satz",
"actor": "Wer muss handeln (z.B. Verantwortlicher, Auftragsverarbeiter)",
"action": "Was muss getan werden",
"normative_strength": "muss|soll|kann"
}}"""
        system_prompt = (
            "Du bist ein Rechtsexperte fuer EU-Datenschutz- und Digitalrecht. "
            "Extrahiere die zentrale rechtliche Pflicht aus Gesetzestexten. "
            "Antworte ausschliesslich als JSON."
        )
        reg_id = _normalize_regulation(regulation_code)
        result_text = await _llm_ollama(prompt, system_prompt)
        if not result_text:
            return ObligationMatch(
                method="llm_extracted",
                confidence=0.0,
                regulation_id=reg_id,
            )

        parsed = _parse_json(result_text)
        if not isinstance(parsed, dict):
            # Valid JSON that is not an object (list, string, ...) carries
            # no usable fields; fall back to the raw reply below.
            parsed = {}
        obligation_text = parsed.get("obligation_text", result_text[:500])
        return ObligationMatch(
            obligation_id=None,
            obligation_title=None,
            obligation_text=obligation_text,
            method="llm_extracted",
            confidence=0.60,
            regulation_id=reg_id,
        )

    # -----------------------------------------------------------------------
    # Initialization helpers
    # -----------------------------------------------------------------------
    def _load_obligations(self) -> None:
        """Load all obligation files from v2 framework.

        Fills ``_obligations`` and the "<regulation>/<article>" lookup.
        Missing directory, manifest or regulation files are logged and
        skipped.
        """
        v2_dir = _find_obligations_dir()
        if not v2_dir:
            logger.warning("Obligations v2 directory not found — Tier 1 disabled")
            return
        manifest_path = v2_dir / "_manifest.json"
        if not manifest_path.exists():
            logger.warning("Manifest not found at %s", manifest_path)
            return

        # JSON is UTF-8; do not depend on the process locale.
        with open(manifest_path, encoding="utf-8") as f:
            manifest = json.load(f)
        regulations = manifest.get("regulations", [])

        for reg_info in regulations:
            reg_id = reg_info["id"]
            reg_file = v2_dir / reg_info["file"]
            if not reg_file.exists():
                logger.warning("Regulation file not found: %s", reg_file)
                continue
            with open(reg_file, encoding="utf-8") as f:
                data = json.load(f)

            for obl in data.get("obligations", []):
                obl_id = obl["id"]
                entry = _ObligationEntry(
                    id=obl_id,
                    title=obl.get("title", ""),
                    description=obl.get("description", ""),
                    regulation_id=reg_id,
                )
                # Build article lookup from legal_basis
                for basis in obl.get("legal_basis", []):
                    article_raw = basis.get("article", "")
                    if not article_raw:
                        continue
                    norm_art = _normalize_article(article_raw)
                    if norm_art in entry.articles:
                        # Several paragraphs of one article normalize to the
                        # same key; register the obligation only once.
                        continue
                    entry.articles.append(norm_art)
                    self._article_lookup.setdefault(
                        f"{reg_id}/{norm_art}", []
                    ).append(obl_id)
                self._obligations[obl_id] = entry

        logger.info(
            "Loaded %d obligations from %d regulations",
            len(self._obligations),
            len(regulations),
        )

    async def _compute_embeddings(self) -> None:
        """Compute embeddings for all obligation descriptions."""
        if not self._obligations:
            return
        self._obligation_ids = list(self._obligations.keys())
        texts = [
            f"{self._obligations[oid].title}: {self._obligations[oid].description}"
            for oid in self._obligation_ids
        ]
        logger.info("Computing embeddings for %d obligations...", len(texts))
        self._obligation_embeddings = await _get_embeddings_batch(texts)
        valid = sum(1 for e in self._obligation_embeddings if e)
        logger.info("Got %d/%d valid embeddings", valid, len(texts))

    # -----------------------------------------------------------------------
    # Stats
    # -----------------------------------------------------------------------
    def stats(self) -> dict:
        """Return initialization statistics."""
        return {
            "total_obligations": len(self._obligations),
            "article_lookups": len(self._article_lookup),
            "embeddings_valid": sum(1 for e in self._obligation_embeddings if e),
            "regulations": list(
                {e.regulation_id for e in self._obligations.values()}
            ),
            "initialized": self._initialized,
        }
# ---------------------------------------------------------------------------
# Module-level helpers (reusable by other modules)
# ---------------------------------------------------------------------------
def _normalize_regulation(regulation_code: str) -> Optional[str]:
"""Map a RAG regulation_code to obligation framework regulation ID."""
if not regulation_code:
return None
code = regulation_code.lower().strip()
# Direct lookup
if code in _REGULATION_CODE_TO_ID:
return _REGULATION_CODE_TO_ID[code]
# Prefix matching for families
for prefix, reg_id in [
("eu_2016_679", "dsgvo"),
("eu_2024_1689", "ai_act"),
("eu_2022_2555", "nis2"),
("eu_2022_2065", "dsa"),
("eu_2023_2854", "data_act"),
("eu_2023_1230", "eu_machinery"),
("eu_2022_2554", "dora"),
]:
if code.startswith(prefix):
return reg_id
return None
def _normalize_article(article: str) -> str:
"""Normalize article references for consistent lookup.
Examples:
"Art. 30" "art. 30"
"§ 38 BDSG" "§ 38"
"Article 10" "art. 10"
"Art. 30 Abs. 1" "art. 30"
"Artikel 35" "art. 35"
"""
if not article:
return ""
s = article.strip()
# Remove trailing law name: "§ 38 BDSG" → "§ 38"
s = re.sub(r"\s+(DSGVO|BDSG|TTDSG|DSA|NIS2|DORA|AI.?Act)\s*$", "", s, flags=re.IGNORECASE)
# Remove paragraph references: "Art. 30 Abs. 1" → "Art. 30"
s = re.sub(r"\s+(Abs|Absatz|para|paragraph|lit|Satz)\.?\s+.*$", "", s, flags=re.IGNORECASE)
# Normalize "Article" / "Artikel" → "Art."
s = re.sub(r"^(Article|Artikel)\s+", "Art. ", s, flags=re.IGNORECASE)
return s.lower().strip()
def _cosine_sim(a: list[float], b: list[float]) -> float:
"""Compute cosine similarity between two vectors."""
if not a or not b or len(a) != len(b):
return 0.0
dot = sum(x * y for x, y in zip(a, b))
norm_a = sum(x * x for x in a) ** 0.5
norm_b = sum(x * x for x in b) ** 0.5
if norm_a == 0 or norm_b == 0:
return 0.0
return dot / (norm_a * norm_b)
def _find_obligations_dir() -> Optional[Path]:
"""Locate the obligations v2 directory."""
candidates = [
Path(__file__).resolve().parent.parent.parent.parent
/ "ai-compliance-sdk" / "policies" / "obligations" / "v2",
Path("/app/ai-compliance-sdk/policies/obligations/v2"),
Path("ai-compliance-sdk/policies/obligations/v2"),
]
for p in candidates:
if p.is_dir() and (p / "_manifest.json").exists():
return p
return None
async def _get_embedding(text: str) -> list[float]:
    """Get embedding vector for a single text.

    Best-effort: any failure (connection error, non-2xx status, malformed
    response) is logged as a warning and yields an empty list, so callers
    can treat "no embedding" uniformly.
    """
    try:
        async with httpx.AsyncClient(timeout=10.0) as client:
            resp = await client.post(
                f"{EMBEDDING_URL}/embed",
                json={"texts": [text]},
            )
            resp.raise_for_status()
            embeddings = resp.json().get("embeddings", [])
            return embeddings[0] if embeddings else []
    except Exception as e:
        # Same policy as the batch helper: degrade gracefully, but leave a
        # trace so a silently disabled embedding tier can be diagnosed.
        logger.warning("Embedding request failed: %s", e)
        return []
async def _get_embeddings_batch(
texts: list[str], batch_size: int = 32
) -> list[list[float]]:
"""Get embeddings for multiple texts in batches."""
all_embeddings: list[list[float]] = []
for i in range(0, len(texts), batch_size):
batch = texts[i : i + batch_size]
try:
async with httpx.AsyncClient(timeout=30.0) as client:
resp = await client.post(
f"{EMBEDDING_URL}/embed",
json={"texts": batch},
)
resp.raise_for_status()
embeddings = resp.json().get("embeddings", [])
all_embeddings.extend(embeddings)
except Exception as e:
logger.warning("Batch embedding failed for %d texts: %s", len(batch), e)
all_embeddings.extend([[] for _ in batch])
return all_embeddings
async def _llm_ollama(prompt: str, system_prompt: Optional[str] = None) -> str:
    """Send a non-streaming chat request to Ollama and return the reply text.

    Returns an empty string when the request fails or the server answers
    with a status other than 200; both cases are logged.
    """
    conversation = [{"role": "user", "content": prompt}]
    if system_prompt:
        # The system message, when given, precedes the user message.
        conversation.insert(0, {"role": "system", "content": system_prompt})

    request_body = {
        "model": OLLAMA_MODEL,
        "messages": conversation,
        "stream": False,
        "format": "json",
        "options": {"num_predict": 512},
        "think": False,
    }

    try:
        async with httpx.AsyncClient(timeout=LLM_TIMEOUT) as client:
            reply = await client.post(f"{OLLAMA_URL}/api/chat", json=request_body)
            if reply.status_code == 200:
                return reply.json().get("message", {}).get("content", "")
            logger.error(
                "Ollama chat failed %d: %s", reply.status_code, reply.text[:300]
            )
            return ""
    except Exception as e:
        logger.warning("Ollama call failed: %s", e)
        return ""
def _parse_json(text: str) -> dict:
"""Extract JSON from LLM response text."""
# Try direct parse
try:
return json.loads(text)
except json.JSONDecodeError:
pass
# Try extracting JSON block
match = re.search(r"\{[^{}]*\}", text, re.DOTALL)
if match:
try:
return json.loads(match.group())
except json.JSONDecodeError:
pass
return {}
@@ -0,0 +1,532 @@
"""Pattern Matcher — Obligation-to-Control-Pattern Linking.
Maps obligations (from the ObligationExtractor) to control patterns
using two tiers:
Tier 1: KEYWORD MATCH obligation_match_keywords from patterns (~70%)
Tier 2: EMBEDDING cosine similarity with domain bonus (~25%)
Part of the Multi-Layer Control Architecture (Phase 5 of 8).
"""
import logging
import os
from dataclasses import dataclass, field
from pathlib import Path
from typing import Optional
import yaml
from compliance.services.obligation_extractor import (
_cosine_sim,
_get_embedding,
_get_embeddings_batch,
)
logger = logging.getLogger(__name__)
# Minimum keyword score to accept a match (at least 2 keyword hits)
KEYWORD_MATCH_MIN_HITS = 2
# Embedding threshold for Tier 2
EMBEDDING_PATTERN_THRESHOLD = 0.75
# Domain bonus when regulation maps to the pattern's domain
DOMAIN_BONUS = 0.10
# Map regulation IDs to pattern domains that are likely relevant
_REGULATION_DOMAIN_AFFINITY = {
"dsgvo": ["DATA", "COMP", "GOV"],
"bdsg": ["DATA", "COMP"],
"ttdsg": ["DATA"],
"ai_act": ["AI", "COMP", "DATA"],
"nis2": ["SEC", "INC", "NET", "LOG", "CRYP"],
"dsa": ["DATA", "COMP"],
"data_act": ["DATA", "COMP"],
"eu_machinery": ["SEC", "COMP"],
"dora": ["SEC", "INC", "FIN", "COMP"],
}
@dataclass
class ControlPattern:
    """A control pattern as loaded from the YAML pattern library."""

    # --- identity -----------------------------------------------------------
    id: str
    name: str
    name_de: str
    domain: str
    category: str
    description: str

    # --- templates ----------------------------------------------------------
    objective_template: str
    rationale_template: str
    requirements_template: list[str] = field(default_factory=list)
    test_procedure_template: list[str] = field(default_factory=list)
    evidence_template: list[str] = field(default_factory=list)

    # --- defaults -----------------------------------------------------------
    severity_default: str = "medium"
    implementation_effort_default: str = "m"

    # --- matching and composition metadata ----------------------------------
    obligation_match_keywords: list[str] = field(default_factory=list)
    tags: list[str] = field(default_factory=list)
    composable_with: list[str] = field(default_factory=list)
    open_anchor_refs: list[dict] = field(default_factory=list)
@dataclass
class PatternMatchResult:
    """Outcome of matching one obligation against the control patterns."""

    pattern: Optional[ControlPattern] = None
    pattern_id: Optional[str] = None
    method: str = "none"  # keyword | embedding | combined | none
    confidence: float = 0.0
    keyword_hits: int = 0
    total_keywords: int = 0
    embedding_score: float = 0.0
    domain_bonus_applied: bool = False
    composable_patterns: list[str] = field(default_factory=list)

    def to_dict(self) -> dict:
        """Return a serialisable summary.

        The ``pattern`` object itself is left out; both scores are rounded
        to three decimals.
        """
        summary = {"pattern_id": self.pattern_id, "method": self.method}
        summary["confidence"] = round(self.confidence, 3)
        summary["keyword_hits"] = self.keyword_hits
        summary["total_keywords"] = self.total_keywords
        summary["embedding_score"] = round(self.embedding_score, 3)
        summary["domain_bonus_applied"] = self.domain_bonus_applied
        summary["composable_patterns"] = self.composable_patterns
        return summary
class PatternMatcher:
"""Links obligations to control patterns using keyword + embedding matching.
Usage::
matcher = PatternMatcher()
await matcher.initialize()
result = await matcher.match(
obligation_text="Fuehrung eines Verarbeitungsverzeichnisses...",
regulation_id="dsgvo",
)
print(result.pattern_id) # e.g. "CP-COMP-001"
print(result.confidence) # e.g. 0.85
"""
def __init__(self):
self._patterns: list[ControlPattern] = []
self._by_id: dict[str, ControlPattern] = {}
self._by_domain: dict[str, list[ControlPattern]] = {}
self._keyword_index: dict[str, list[str]] = {} # keyword → [pattern_ids]
self._pattern_embeddings: list[list[float]] = []
self._pattern_ids: list[str] = []
self._initialized = False
async def initialize(self) -> None:
"""Load patterns from YAML and compute embeddings."""
if self._initialized:
return
self._load_patterns()
self._build_keyword_index()
await self._compute_embeddings()
self._initialized = True
logger.info(
"PatternMatcher initialized: %d patterns, %d keywords, %d embeddings",
len(self._patterns),
len(self._keyword_index),
sum(1 for e in self._pattern_embeddings if e),
)
async def match(
    self,
    obligation_text: str,
    regulation_id: Optional[str] = None,
    top_n: int = 1,
) -> PatternMatchResult:
    """Match obligation text to the best control pattern.

    Args:
        obligation_text: The obligation description to match against.
        regulation_id: Source regulation (for domain bonus).
        top_n: Accepted for interface compatibility; not used here.

    Returns:
        The combined keyword/embedding result.  An empty
        PatternMatchResult is returned when there is no text or no
        pattern is loaded.
    """
    if not self._initialized:
        await self.initialize()
    if not (obligation_text and self._patterns):
        return PatternMatchResult()

    # Tier 1 (keywords) and Tier 2 (embeddings) are both evaluated and then
    # merged into a single result.
    from_keywords = self._tier1_keyword(obligation_text, regulation_id)
    from_embedding = await self._tier2_embedding(obligation_text, regulation_id)
    winner = self._combine_results(from_keywords, from_embedding)

    if winner.pattern:
        # Only advertise composable patterns that are actually loaded.
        known = self._by_id
        winner.composable_patterns = [
            pid for pid in winner.pattern.composable_with if pid in known
        ]
    return winner
async def match_top_n(
    self,
    obligation_text: str,
    regulation_id: Optional[str] = None,
    n: int = 3,
) -> list[PatternMatchResult]:
    """Return up to ``n`` pattern matches, highest confidence first.

    Every pattern that received a keyword score or an embedding score is a
    candidate. Its confidence is the keyword score, the embedding score, or,
    when both are positive, the larger of the two plus 0.05. The result is
    capped at 1.0.
    """
    if not self._initialized:
        await self.initialize()

    if not (obligation_text and self._patterns):
        return []

    kw_table = self._keyword_scores(obligation_text, regulation_id)
    emb_table = await self._embedding_scores(obligation_text, regulation_id)

    candidates: list[PatternMatchResult] = []
    for pid in set(kw_table.keys()) | set(emb_table.keys()):
        pattern = self._by_id.get(pid)
        if not pattern:
            continue

        hits, total, kw_conf = kw_table.get(pid, (0, 0, 0.0))
        emb_conf, bonus = emb_table.get(pid, (0.0, False))

        # Decide which tier(s) contributed to this candidate.
        if not kw_conf > 0:
            method, score = "embedding", emb_conf
        elif not emb_conf > 0:
            method, score = "keyword", kw_conf
        else:
            method, score = "combined", max(kw_conf, emb_conf) + 0.05

        candidates.append(PatternMatchResult(
            pattern=pattern,
            pattern_id=pid,
            method=method,
            confidence=min(score, 1.0),
            keyword_hits=hits,
            total_keywords=total,
            embedding_score=emb_conf,
            domain_bonus_applied=bonus,
            composable_patterns=[
                ref for ref in pattern.composable_with if ref in self._by_id
            ],
        ))

    ordered = sorted(candidates, key=lambda r: r.confidence, reverse=True)
    return ordered[:n]
# -----------------------------------------------------------------------
# Tier 1: Keyword Match
# -----------------------------------------------------------------------
def _tier1_keyword(
    self, obligation_text: str, regulation_id: Optional[str]
) -> Optional[PatternMatchResult]:
    """Match by counting keyword hits in the obligation text.

    Only patterns with at least ``KEYWORD_MATCH_MIN_HITS`` hits are
    considered. Among those, the one with the highest hit ratio wins, and
    the first in score order wins a tie. ``DOMAIN_BONUS`` is added to the
    winner's confidence (capped at 1.0) when its domain has affinity with
    ``regulation_id``.

    Returns:
        A PatternMatchResult with method "keyword", or None when no pattern
        reaches the minimum number of hits.
    """
    scores = self._keyword_scores(obligation_text, regulation_id)

    # Filter BEFORE choosing the best ratio. Otherwise a pattern with one
    # keyword and one hit (ratio 1.0) wins the max(), fails the min-hits
    # check and hides a pattern that legitimately matched several keywords.
    eligible = {
        pid: score
        for pid, score in scores.items()
        if score[0] >= KEYWORD_MATCH_MIN_HITS
    }
    if not eligible:
        return None

    best_pid = max(eligible, key=lambda pid: eligible[pid][2])
    hits, total, confidence = eligible[best_pid]

    pattern = self._by_id.get(best_pid)
    if not pattern:
        return None

    # Domain bonus for patterns whose domain fits the source regulation.
    bonus_applied = False
    if regulation_id and self._domain_matches(pattern.domain, regulation_id):
        confidence = min(confidence + DOMAIN_BONUS, 1.0)
        bonus_applied = True

    return PatternMatchResult(
        pattern=pattern,
        pattern_id=best_pid,
        method="keyword",
        confidence=confidence,
        keyword_hits=hits,
        total_keywords=total,
        domain_bonus_applied=bonus_applied,
    )
def _keyword_scores(
self, text: str, regulation_id: Optional[str]
) -> dict[str, tuple[int, int, float]]:
"""Compute keyword match scores for all patterns.
Returns dict: pattern_id (hits, total_keywords, confidence).
"""
text_lower = text.lower()
hits_by_pattern: dict[str, int] = {}
for keyword, pattern_ids in self._keyword_index.items():
if keyword in text_lower:
for pid in pattern_ids:
hits_by_pattern[pid] = hits_by_pattern.get(pid, 0) + 1
result: dict[str, tuple[int, int, float]] = {}
for pid, hits in hits_by_pattern.items():
pattern = self._by_id.get(pid)
if not pattern:
continue
total = len(pattern.obligation_match_keywords)
confidence = hits / total if total > 0 else 0.0
result[pid] = (hits, total, confidence)
return result
# -----------------------------------------------------------------------
# Tier 2: Embedding Match
# -----------------------------------------------------------------------
async def _tier2_embedding(
    self, obligation_text: str, regulation_id: Optional[str]
) -> Optional[PatternMatchResult]:
    """Match by embedding similarity between the text and the patterns.

    Takes the highest-scoring pattern from ``_embedding_scores`` and accepts
    it only if its score reaches ``EMBEDDING_PATTERN_THRESHOLD``.

    Returns:
        A PatternMatchResult with method "embedding" (confidence capped at
        1.0), or None when nothing scores high enough.
    """
    scored = await self._embedding_scores(obligation_text, regulation_id)
    if not scored:
        return None

    top_pid = max(scored, key=lambda pid: scored[pid][0])
    similarity, bonus = scored[top_pid]

    pattern = self._by_id.get(top_pid)
    if similarity < EMBEDDING_PATTERN_THRESHOLD or not pattern:
        return None

    return PatternMatchResult(
        pattern=pattern,
        pattern_id=top_pid,
        method="embedding",
        confidence=min(similarity, 1.0),
        embedding_score=similarity,
        domain_bonus_applied=bonus,
    )
async def _embedding_scores(
self, obligation_text: str, regulation_id: Optional[str]
) -> dict[str, tuple[float, bool]]:
"""Compute embedding similarity scores for all patterns.
Returns dict: pattern_id (score, domain_bonus_applied).
"""
if not self._pattern_embeddings:
return {}
chunk_embedding = await _get_embedding(obligation_text[:2000])
if not chunk_embedding:
return {}
result: dict[str, tuple[float, bool]] = {}
for i, pat_emb in enumerate(self._pattern_embeddings):
if not pat_emb:
continue
pid = self._pattern_ids[i]
pattern = self._by_id.get(pid)
if not pattern:
continue
score = _cosine_sim(chunk_embedding, pat_emb)
# Domain bonus
bonus_applied = False
if regulation_id and self._domain_matches(pattern.domain, regulation_id):
score += DOMAIN_BONUS
bonus_applied = True
result[pid] = (score, bonus_applied)
return result
# -----------------------------------------------------------------------
# Score combination
# -----------------------------------------------------------------------
def _combine_results(
self,
keyword_result: Optional[PatternMatchResult],
embedding_result: Optional[PatternMatchResult],
) -> PatternMatchResult:
"""Combine keyword and embedding results into the best match."""
if not keyword_result and not embedding_result:
return PatternMatchResult()
if not keyword_result:
return embedding_result
if not embedding_result:
return keyword_result
# Both matched — check if they agree
if keyword_result.pattern_id == embedding_result.pattern_id:
# Same pattern: boost confidence
combined_confidence = min(
max(keyword_result.confidence, embedding_result.confidence) + 0.05,
1.0,
)
return PatternMatchResult(
pattern=keyword_result.pattern,
pattern_id=keyword_result.pattern_id,
method="combined",
confidence=combined_confidence,
keyword_hits=keyword_result.keyword_hits,
total_keywords=keyword_result.total_keywords,
embedding_score=embedding_result.embedding_score,
domain_bonus_applied=(
keyword_result.domain_bonus_applied
or embedding_result.domain_bonus_applied
),
)
# Different patterns: pick the one with higher confidence
if keyword_result.confidence >= embedding_result.confidence:
return keyword_result
return embedding_result
# -----------------------------------------------------------------------
# Domain affinity
# -----------------------------------------------------------------------
@staticmethod
def _domain_matches(pattern_domain: str, regulation_id: str) -> bool:
    """Tell whether ``pattern_domain`` is listed as affine to the regulation.

    Regulations missing from ``_REGULATION_DOMAIN_AFFINITY`` have no affine
    domains, so the result is False for them.
    """
    return pattern_domain in _REGULATION_DOMAIN_AFFINITY.get(regulation_id, [])
# -----------------------------------------------------------------------
# Initialization helpers
# -----------------------------------------------------------------------
def _load_patterns(self) -> None:
    """Load control patterns from the YAML files in the patterns directory.

    Files whose name starts with "_" are skipped, as are files without a
    top-level "patterns" key. Each loaded pattern is appended to
    ``_patterns`` and registered in ``_by_id`` and ``_by_domain``.

    Loading is best-effort: an unreadable file or a malformed entry is
    logged and skipped; nothing is raised to the caller.
    """
    patterns_dir = _find_patterns_dir()
    if not patterns_dir:
        logger.warning("Control patterns directory not found")
        return

    for yaml_file in sorted(patterns_dir.glob("*.yaml")):
        if yaml_file.name.startswith("_"):
            continue
        try:
            # Explicit UTF-8: the files carry German text (name_de), and the
            # platform default encoding is not guaranteed to be UTF-8.
            with open(yaml_file, encoding="utf-8") as f:
                data = yaml.safe_load(f)
            if not data or "patterns" not in data:
                continue
            for p in data["patterns"]:
                try:
                    pattern = ControlPattern(
                        id=p["id"],
                        name=p["name"],
                        name_de=p["name_de"],
                        domain=p["domain"],
                        category=p["category"],
                        description=p["description"],
                        objective_template=p["objective_template"],
                        rationale_template=p["rationale_template"],
                        requirements_template=p.get("requirements_template", []),
                        test_procedure_template=p.get("test_procedure_template", []),
                        evidence_template=p.get("evidence_template", []),
                        severity_default=p.get("severity_default", "medium"),
                        implementation_effort_default=p.get("implementation_effort_default", "m"),
                        obligation_match_keywords=p.get("obligation_match_keywords", []),
                        tags=p.get("tags", []),
                        composable_with=p.get("composable_with", []),
                        open_anchor_refs=p.get("open_anchor_refs", []),
                    )
                except (KeyError, TypeError, AttributeError) as e:
                    # One malformed entry must not drop the remaining
                    # well-formed entries of the same file.
                    logger.error(
                        "Skipping malformed pattern in %s: %s", yaml_file.name, e
                    )
                    continue
                self._patterns.append(pattern)
                self._by_id[pattern.id] = pattern
                self._by_domain.setdefault(pattern.domain, []).append(pattern)
        except Exception as e:
            logger.error("Failed to load %s: %s", yaml_file.name, e)

    logger.info("Loaded %d patterns from %s", len(self._patterns), patterns_dir)
def _build_keyword_index(self) -> None:
"""Build reverse index: keyword → [pattern_ids]."""
for pattern in self._patterns:
for kw in pattern.obligation_match_keywords:
lower_kw = kw.lower()
if lower_kw not in self._keyword_index:
self._keyword_index[lower_kw] = []
self._keyword_index[lower_kw].append(pattern.id)
async def _compute_embeddings(self) -> None:
"""Compute embeddings for all pattern objective templates."""
if not self._patterns:
return
self._pattern_ids = [p.id for p in self._patterns]
texts = [
f"{p.name_de}: {p.objective_template}"
for p in self._patterns
]
logger.info("Computing embeddings for %d patterns...", len(texts))
self._pattern_embeddings = await _get_embeddings_batch(texts)
valid = sum(1 for e in self._pattern_embeddings if e)
logger.info("Got %d/%d valid pattern embeddings", valid, len(texts))
# -----------------------------------------------------------------------
# Public helpers
# -----------------------------------------------------------------------
def get_pattern(self, pattern_id: str) -> Optional[ControlPattern]:
    """Look up a pattern by ID (upper-cased before lookup); None if unknown."""
    lookup_key = pattern_id.upper()
    return self._by_id.get(lookup_key)
def get_patterns_by_domain(self, domain: str) -> list[ControlPattern]:
    """Return the patterns of a domain (upper-cased before lookup), or []."""
    lookup_key = domain.upper()
    return self._by_domain.get(lookup_key, [])
def stats(self) -> dict:
    """Summarize the matcher: pattern, domain, keyword and embedding counts."""
    valid_embeddings = 0
    for vec in self._pattern_embeddings:
        if vec:
            valid_embeddings += 1

    return {
        "total_patterns": len(self._patterns),
        "domains": list(self._by_domain),
        "keywords": len(self._keyword_index),
        "embeddings_valid": valid_embeddings,
        "initialized": self._initialized,
    }
def _find_patterns_dir() -> Optional[Path]:
    """Return the first existing control_patterns directory, or None.

    Candidates, in order: a path four directory levels above this file, the
    fixed ``/app`` location, and a path relative to the working directory.
    """
    base = Path(__file__).resolve().parent.parent.parent.parent
    candidates = (
        base / "ai-compliance-sdk" / "policies" / "control_patterns",
        Path("/app/ai-compliance-sdk/policies/control_patterns"),
        Path("ai-compliance-sdk/policies/control_patterns"),
    )
    return next((path for path in candidates if path.is_dir()), None)
@@ -0,0 +1,670 @@
"""Pipeline Adapter — New 10-Stage Pipeline Integration.
Bridges the existing 7-stage control_generator pipeline with the new
multi-layer components (ObligationExtractor, PatternMatcher, ControlComposer).
New pipeline flow:
chunk → license_classify
→ obligation_extract (Stage 4 — NEW)
→ pattern_match (Stage 5 — NEW)
→ control_compose (Stage 6 — replaces old Stage 3)
→ harmonize → anchor → store + crosswalk → mark processed
Can be used in two modes:
1. INLINE: Called from _process_batch() to enrich the pipeline
2. STANDALONE: Process chunks directly through new stages
Part of the Multi-Layer Control Architecture (Phase 7 of 8).
"""
import hashlib
import json
import logging
from dataclasses import dataclass, field
from typing import Optional
from sqlalchemy import text
from sqlalchemy.orm import Session
from compliance.services.control_composer import ComposedControl, ControlComposer
from compliance.services.obligation_extractor import ObligationExtractor, ObligationMatch
from compliance.services.pattern_matcher import PatternMatcher, PatternMatchResult
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
@dataclass
class PipelineChunk:
    """One input chunk for the new pipeline stages."""

    # Raw chunk text; the only required field.
    text: str
    collection: str = ""
    regulation_code: str = ""
    article: Optional[str] = None
    paragraph: Optional[str] = None
    # Only values 1 and 2 cause the chunk text to be forwarded to the composer.
    license_rule: int = 3
    license_info: dict = field(default_factory=dict)
    source_citation: Optional[dict] = None
    # SHA-256 hex digest of ``text``; filled lazily by compute_hash().
    chunk_hash: str = ""

    def compute_hash(self) -> str:
        """Return the chunk hash, computing and caching it on first use."""
        if self.chunk_hash:
            return self.chunk_hash
        digest = hashlib.sha256(self.text.encode()).hexdigest()
        self.chunk_hash = digest
        return self.chunk_hash
@dataclass
class PipelineResult:
    """Outcome of running one chunk through the new pipeline stages."""

    chunk: PipelineChunk
    obligation: ObligationMatch = field(default_factory=ObligationMatch)
    pattern_result: PatternMatchResult = field(default_factory=PatternMatchResult)
    # Stays None when composition did not run or failed.
    control: Optional[ComposedControl] = None
    # Set to True by PipelineAdapter.write_crosswalk() after a successful commit.
    crosswalk_written: bool = False
    # String form of the exception that aborted processing, if any.
    error: Optional[str] = None

    def to_dict(self) -> dict:
        """Serialize the result; missing parts are emitted as None."""
        obligation = self.obligation.to_dict() if self.obligation else None
        pattern = self.pattern_result.to_dict() if self.pattern_result else None
        control = self.control.to_dict() if self.control else None
        return {
            "chunk_hash": self.chunk.chunk_hash,
            "obligation": obligation,
            "pattern": pattern,
            "control": control,
            "crosswalk_written": self.crosswalk_written,
            "error": self.error,
        }
class PipelineAdapter:
"""Integrates ObligationExtractor + PatternMatcher + ControlComposer.
Usage::
adapter = PipelineAdapter(db)
await adapter.initialize()
result = await adapter.process_chunk(PipelineChunk(
text="...",
regulation_code="eu_2016_679",
article="Art. 30",
license_rule=1,
))
"""
def __init__(self, db: Optional[Session] = None):
    """Create the adapter and its three sub-components.

    Args:
        db: Optional SQLAlchemy session; without it write_crosswalk() is a
            no-op that returns False.
    """
    self._initialized = False
    self.db = db
    # Stage 4, 5 and 6 components, in pipeline order.
    self._extractor = ObligationExtractor()
    self._matcher = PatternMatcher()
    self._composer = ControlComposer()
async def initialize(self) -> None:
    """Initialize the extractor and the matcher; repeated calls are no-ops."""
    if not self._initialized:
        await self._extractor.initialize()
        await self._matcher.initialize()
        self._initialized = True
        logger.info("PipelineAdapter initialized")
async def process_chunk(self, chunk: PipelineChunk) -> PipelineResult:
    """Run one chunk through the three new pipeline stages.

    Stage 4: Obligation Extract
    Stage 5: Pattern Match
    Stage 6: Control Compose

    Any exception raised by a stage is logged and stored as a string in
    ``result.error``; the partially filled result is still returned.
    """
    if not self._initialized:
        await self.initialize()

    chunk.compute_hash()
    outcome = PipelineResult(chunk=chunk)

    try:
        # Stage 4: Obligation Extract
        obligation = await self._extractor.extract(
            chunk_text=chunk.text,
            regulation_code=chunk.regulation_code,
            article=chunk.article,
            paragraph=chunk.paragraph,
        )
        outcome.obligation = obligation

        # Stage 5: Pattern Match. Prefer the extracted obligation text, then
        # its title, then the first 500 characters of the raw chunk.
        match_input = (
            obligation.obligation_text
            or obligation.obligation_title
            or chunk.text[:500]
        )
        pattern_result = await self._matcher.match(
            obligation_text=match_input,
            regulation_id=obligation.regulation_id,
        )
        outcome.pattern_result = pattern_result

        # Stage 6: Control Compose. The raw text is forwarded only for
        # license rules 1 and 2.
        forwarded_text = chunk.text if chunk.license_rule in (1, 2) else None
        outcome.control = await self._composer.compose(
            obligation=obligation,
            pattern_result=pattern_result,
            chunk_text=forwarded_text,
            license_rule=chunk.license_rule,
            source_citation=chunk.source_citation,
            regulation_code=chunk.regulation_code,
        )
    except Exception as e:
        logger.error("Pipeline processing failed: %s", e)
        outcome.error = str(e)

    return outcome
async def process_batch(self, chunks: list[PipelineChunk]) -> list[PipelineResult]:
    """Process the chunks one after another and return results in input order."""
    return [await self.process_chunk(chunk) for chunk in chunks]
def write_crosswalk(self, result: PipelineResult, control_uuid: str) -> bool:
    """Write obligation_extraction + crosswalk_matrix rows for a processed chunk.

    Called AFTER the control is stored in canonical_controls. All three
    statements run in one transaction; on any error it is rolled back.

    Args:
        result: Pipeline result whose chunk, obligation, pattern and control
            are persisted.
        control_uuid: UUID (as string) of the stored canonical control.

    Returns:
        True when the rows were committed (``result.crosswalk_written`` is
        set as well); False when there is no DB session, no control, or the
        write failed.
    """
    if not self.db or not result.control:
        return False

    chunk = result.chunk
    obligation = result.obligation
    pattern = result.pattern_result

    try:
        # 1. Write obligation_extraction row
        self.db.execute(
            text("""
                INSERT INTO obligation_extractions (
                    chunk_hash, collection, regulation_code,
                    article, paragraph, obligation_id,
                    obligation_text, confidence, extraction_method,
                    pattern_id, pattern_match_score, control_uuid
                ) VALUES (
                    :chunk_hash, :collection, :regulation_code,
                    :article, :paragraph, :obligation_id,
                    :obligation_text, :confidence, :extraction_method,
                    :pattern_id, :pattern_match_score,
                    CAST(:control_uuid AS uuid)
                )
            """),
            {
                "chunk_hash": chunk.chunk_hash,
                "collection": chunk.collection,
                "regulation_code": chunk.regulation_code,
                "article": chunk.article,
                "paragraph": chunk.paragraph,
                "obligation_id": obligation.obligation_id if obligation else None,
                "obligation_text": (
                    obligation.obligation_text[:2000]
                    if obligation and obligation.obligation_text
                    else None
                ),
                "confidence": obligation.confidence if obligation else 0,
                "extraction_method": obligation.method if obligation else "none",
                "pattern_id": pattern.pattern_id if pattern else None,
                "pattern_match_score": pattern.confidence if pattern else 0,
                "control_uuid": control_uuid,
            },
        )

        # 2. Write crosswalk_matrix row. The link is only as trustworthy as
        # its weakest stage, hence the min() of both confidences.
        self.db.execute(
            text("""
                INSERT INTO crosswalk_matrix (
                    regulation_code, article, paragraph,
                    obligation_id, pattern_id,
                    master_control_id, master_control_uuid,
                    confidence, source
                ) VALUES (
                    :regulation_code, :article, :paragraph,
                    :obligation_id, :pattern_id,
                    :master_control_id,
                    CAST(:master_control_uuid AS uuid),
                    :confidence, :source
                )
            """),
            {
                "regulation_code": chunk.regulation_code,
                "article": chunk.article,
                "paragraph": chunk.paragraph,
                "obligation_id": obligation.obligation_id if obligation else None,
                "pattern_id": pattern.pattern_id if pattern else None,
                "master_control_id": result.control.control_id,
                "master_control_uuid": control_uuid,
                "confidence": min(
                    obligation.confidence if obligation else 0,
                    pattern.confidence if pattern else 0,
                ),
                "source": "auto",
            },
        )

        # 3. Update canonical_controls with pattern_id + obligation_ids
        new_pattern_id = result.control.pattern_id
        new_obligation_ids = result.control.obligation_ids
        if new_pattern_id or new_obligation_ids:
            self.db.execute(
                text("""
                    UPDATE canonical_controls
                    SET pattern_id = COALESCE(:pattern_id, pattern_id),
                        obligation_ids = COALESCE(:obligation_ids, obligation_ids)
                    WHERE id = CAST(:control_uuid AS uuid)
                """),
                {
                    # Bind NULL for absent values so COALESCE keeps what is
                    # stored. json.dumps() of an empty or missing list yields
                    # the non-NULL strings '[]' / 'null', which would
                    # overwrite existing obligation_ids.
                    "pattern_id": new_pattern_id or None,
                    "obligation_ids": (
                        json.dumps(new_obligation_ids)
                        if new_obligation_ids
                        else None
                    ),
                    "control_uuid": control_uuid,
                },
            )

        self.db.commit()
        result.crosswalk_written = True
        return True

    except Exception as e:
        logger.error("Failed to write crosswalk: %s", e)
        self.db.rollback()
        return False
def stats(self) -> dict:
    """Collect the extractor's and matcher's statistics plus the init flag."""
    extractor_stats = self._extractor.stats()
    matcher_stats = self._matcher.stats()
    return {
        "extractor": extractor_stats,
        "matcher": matcher_stats,
        "initialized": self._initialized,
    }
# ---------------------------------------------------------------------------
# Migration Passes — Backfill existing 4,800+ controls
# ---------------------------------------------------------------------------
class MigrationPasses:
"""Non-destructive migration passes for existing controls.
Pass 1: Obligation Linkage (deterministic, article→obligation lookup)
Pass 2: Pattern Classification (keyword-based matching)
Pass 3: Quality Triage (categorize by linkage completeness)
Pass 4: Crosswalk Backfill (write crosswalk rows for linked controls)
Pass 5: Deduplication (mark duplicate controls)
Usage::
migration = MigrationPasses(db)
await migration.initialize()
result = await migration.run_pass1_obligation_linkage(limit=100)
result = await migration.run_pass2_pattern_classification(limit=100)
result = migration.run_pass3_quality_triage()
result = migration.run_pass4_crosswalk_backfill()
result = migration.run_pass5_deduplication()
"""
def __init__(self, db: Session):
    """Bind the migration passes to a DB session and create the helpers."""
    self._initialized = False
    self.db = db
    # Used for obligation lookup (pass 1) and keyword matching (pass 2).
    self._extractor = ObligationExtractor()
    self._matcher = PatternMatcher()
async def initialize(self) -> None:
    """Load obligations and patterns and build the keyword index.

    Calls the extractor's and matcher's loaders directly; the matcher's
    embedding computation is not invoked. Repeated calls are no-ops.
    """
    if not self._initialized:
        self._extractor._load_obligations()
        self._matcher._load_patterns()
        self._matcher._build_keyword_index()
        self._initialized = True
# -------------------------------------------------------------------
# Pass 1: Obligation Linkage (deterministic)
# -------------------------------------------------------------------
async def run_pass1_obligation_linkage(self, limit: int = 0) -> dict:
    """Link existing controls to obligations via source_citation article.

    For each control with source_citation → extract regulation + article
    → look up in obligation framework → set obligation_ids.

    Only non-deprecated controls without obligation_ids are considered. All
    updates are committed together at the end; on any error the transaction
    is rolled back and the exception re-raised.

    Args:
        limit: Maximum number of controls to process; 0 (or less) means all.

    Returns:
        Counters: "total", "linked", "no_match", "no_citation".
    """
    if not self._initialized:
        await self.initialize()

    query = """
        SELECT id, control_id, source_citation, generation_metadata
        FROM canonical_controls
        WHERE release_state NOT IN ('deprecated')
          AND (obligation_ids IS NULL OR obligation_ids = '[]')
    """
    # Bind LIMIT as a parameter instead of formatting it into the SQL text.
    params = {}
    if limit > 0:
        query += " LIMIT :limit"
        params["limit"] = int(limit)

    rows = self.db.execute(text(query), params or None).fetchall()

    stats = {"total": len(rows), "linked": 0, "no_match": 0, "no_citation": 0}

    try:
        for row in rows:
            control_uuid = str(row[0])
            citation = row[2]
            metadata = row[3]

            # Extract regulation + article from citation or metadata
            reg_code, article = _extract_regulation_article(citation, metadata)
            if not reg_code:
                stats["no_citation"] += 1
                continue

            # Tier 1: Exact match
            match = self._extractor._tier1_exact(reg_code, article or "")
            if match and match.obligation_id:
                self.db.execute(
                    text("""
                        UPDATE canonical_controls
                        SET obligation_ids = :obl_ids
                        WHERE id = CAST(:uuid AS uuid)
                    """),
                    {
                        "obl_ids": json.dumps([match.obligation_id]),
                        "uuid": control_uuid,
                    },
                )
                stats["linked"] += 1
            else:
                stats["no_match"] += 1

        self.db.commit()
    except Exception:
        # Do not leave the session in a failed, half-applied transaction.
        self.db.rollback()
        raise

    logger.info("Pass 1: %s", stats)
    return stats
# -------------------------------------------------------------------
# Pass 2: Pattern Classification (keyword-based)
# -------------------------------------------------------------------
async def run_pass2_pattern_classification(self, limit: int = 0) -> dict:
    """Classify existing controls into patterns via keyword matching.

    For each control without pattern_id → keyword-match title+objective
    against pattern library → assign best match.

    A match is accepted only with at least 2 keyword hits. All updates are
    committed together at the end; on any error the transaction is rolled
    back and the exception re-raised.

    Args:
        limit: Maximum number of controls to process; 0 (or less) means all.

    Returns:
        Counters: "total", "classified", "no_match".
    """
    if not self._initialized:
        await self.initialize()

    query = """
        SELECT id, control_id, title, objective
        FROM canonical_controls
        WHERE release_state NOT IN ('deprecated')
          AND (pattern_id IS NULL OR pattern_id = '')
    """
    # Bind LIMIT as a parameter instead of formatting it into the SQL text.
    params = {}
    if limit > 0:
        query += " LIMIT :limit"
        params["limit"] = int(limit)

    rows = self.db.execute(text(query), params or None).fetchall()

    stats = {"total": len(rows), "classified": 0, "no_match": 0}

    try:
        for row in rows:
            control_uuid = str(row[0])
            title = row[2] or ""
            objective = row[3] or ""

            # Keyword match on title + objective; no regulation is known
            # here, so no domain bonus can apply.
            match_text = f"{title} {objective}"
            result = self._matcher._tier1_keyword(match_text, None)

            if result and result.pattern_id and result.keyword_hits >= 2:
                self.db.execute(
                    text("""
                        UPDATE canonical_controls
                        SET pattern_id = :pattern_id
                        WHERE id = CAST(:uuid AS uuid)
                    """),
                    {
                        "pattern_id": result.pattern_id,
                        "uuid": control_uuid,
                    },
                )
                stats["classified"] += 1
            else:
                stats["no_match"] += 1

        self.db.commit()
    except Exception:
        # Do not leave the session in a failed, half-applied transaction.
        self.db.rollback()
        raise

    logger.info("Pass 2: %s", stats)
    return stats
# -------------------------------------------------------------------
# Pass 3: Quality Triage
# -------------------------------------------------------------------
def run_pass3_quality_triage(self) -> dict:
    """Tag every non-deprecated control with its linkage completeness.

    Sets generation_metadata.triage_status:
    - "review": has both obligation_id + pattern_id
    - "needs_obligation": has pattern_id but no obligation_id
    - "needs_pattern": has obligation_id but no pattern_id
    - "legacy_unlinked": has neither

    Returns:
        dict: category -> number of rows updated.
    """
    statements = {
        "review": """
            UPDATE canonical_controls
            SET generation_metadata = jsonb_set(
                COALESCE(generation_metadata::jsonb, '{}'::jsonb),
                '{triage_status}', '"review"'
            )
            WHERE release_state NOT IN ('deprecated')
              AND obligation_ids IS NOT NULL AND obligation_ids != '[]'
              AND pattern_id IS NOT NULL AND pattern_id != ''
        """,
        "needs_obligation": """
            UPDATE canonical_controls
            SET generation_metadata = jsonb_set(
                COALESCE(generation_metadata::jsonb, '{}'::jsonb),
                '{triage_status}', '"needs_obligation"'
            )
            WHERE release_state NOT IN ('deprecated')
              AND (obligation_ids IS NULL OR obligation_ids = '[]')
              AND pattern_id IS NOT NULL AND pattern_id != ''
        """,
        "needs_pattern": """
            UPDATE canonical_controls
            SET generation_metadata = jsonb_set(
                COALESCE(generation_metadata::jsonb, '{}'::jsonb),
                '{triage_status}', '"needs_pattern"'
            )
            WHERE release_state NOT IN ('deprecated')
              AND obligation_ids IS NOT NULL AND obligation_ids != '[]'
              AND (pattern_id IS NULL OR pattern_id = '')
        """,
        "legacy_unlinked": """
            UPDATE canonical_controls
            SET generation_metadata = jsonb_set(
                COALESCE(generation_metadata::jsonb, '{}'::jsonb),
                '{triage_status}', '"legacy_unlinked"'
            )
            WHERE release_state NOT IN ('deprecated')
              AND (obligation_ids IS NULL OR obligation_ids = '[]')
              AND (pattern_id IS NULL OR pattern_id = '')
        """,
    }

    # One UPDATE per category, executed in declaration order.
    stats = {
        category: self.db.execute(text(sql)).rowcount
        for category, sql in statements.items()
    }

    self.db.commit()
    logger.info("Pass 3: %s", stats)
    return stats
# -------------------------------------------------------------------
# Pass 4: Crosswalk Backfill
# -------------------------------------------------------------------
def run_pass4_crosswalk_backfill(self) -> dict:
    """Insert crosswalk_matrix rows for controls with obligation + pattern.

    One row is written per (control, obligation_id) pair with a fixed
    confidence of 0.80 and source 'migrated'. Pairs that already have a
    crosswalk row for the same control UUID and obligation are skipped.

    Returns:
        {"rows_inserted": <number of new rows>}.
    """
    backfill_sql = text("""
        INSERT INTO crosswalk_matrix (
            regulation_code, obligation_id, pattern_id,
            master_control_id, master_control_uuid,
            confidence, source
        )
        SELECT
            COALESCE(
                (generation_metadata::jsonb->>'source_regulation'),
                ''
            ) AS regulation_code,
            obl.value::text AS obligation_id,
            cc.pattern_id,
            cc.control_id,
            cc.id,
            0.80,
            'migrated'
        FROM canonical_controls cc,
             jsonb_array_elements_text(
                 COALESCE(cc.obligation_ids::jsonb, '[]'::jsonb)
             ) AS obl(value)
        WHERE cc.release_state NOT IN ('deprecated')
          AND cc.pattern_id IS NOT NULL AND cc.pattern_id != ''
          AND cc.obligation_ids IS NOT NULL AND cc.obligation_ids != '[]'
          AND NOT EXISTS (
              SELECT 1 FROM crosswalk_matrix cw
              WHERE cw.master_control_uuid = cc.id
                AND cw.obligation_id = obl.value::text
          )
    """)

    inserted = self.db.execute(backfill_sql).rowcount
    self.db.commit()

    logger.info("Pass 4: %d crosswalk rows inserted", inserted)
    return {"rows_inserted": inserted}
# -------------------------------------------------------------------
# Pass 5: Deduplication
# -------------------------------------------------------------------
def run_pass5_deduplication(self) -> dict:
    """Mark duplicate controls (same obligation + same pattern).

    Groups non-deprecated controls by (obligation_id, pattern_id), keeps the
    one with the highest evidence_confidence (newest on ties or NULLs) and
    marks the rest as deprecated with a ``deprecated_reason`` in
    generation_metadata.

    Returns:
        {"groups_found": <duplicate groups>,
         "controls_deprecated": <rows actually deprecated>}.
    """
    # Find groups with duplicates
    groups = self.db.execute(text("""
        SELECT cc.pattern_id,
               obl.value::text AS obligation_id,
               array_agg(cc.id ORDER BY cc.evidence_confidence DESC NULLS LAST, cc.created_at DESC) AS ids,
               count(*) AS cnt
        FROM canonical_controls cc,
             jsonb_array_elements_text(
                 COALESCE(cc.obligation_ids::jsonb, '[]'::jsonb)
             ) AS obl(value)
        WHERE cc.release_state NOT IN ('deprecated')
          AND cc.pattern_id IS NOT NULL AND cc.pattern_id != ''
        GROUP BY cc.pattern_id, obl.value::text
        HAVING count(*) > 1
    """)).fetchall()

    stats = {"groups_found": len(groups), "controls_deprecated": 0}

    # NOTE(review): a control with several obligation_ids appears in several
    # groups, so the keeper of one group can be deprecated by another group.
    # Confirm this is acceptable for multi-obligation controls.
    for group in groups:
        ids = group[2]  # Array of UUIDs, first is the keeper
        if len(ids) <= 1:
            continue

        # Keep first (highest confidence), deprecate rest
        for dep_id in ids[1:]:
            outcome = self.db.execute(
                text("""
                    UPDATE canonical_controls
                    SET release_state = 'deprecated',
                        generation_metadata = jsonb_set(
                            COALESCE(generation_metadata::jsonb, '{}'::jsonb),
                            '{deprecated_reason}', '"duplicate_same_obligation_pattern"'
                        )
                    WHERE id = CAST(:uuid AS uuid)
                      AND release_state != 'deprecated'
                """),
                {"uuid": str(dep_id)},
            )
            # Count rows actually changed: a control already deprecated via
            # an earlier group matches zero rows and must not be counted
            # again.
            stats["controls_deprecated"] += max(outcome.rowcount, 0)

    self.db.commit()
    logger.info("Pass 5: %s", stats)
    return stats
def migration_status(self) -> dict:
    """Report overall migration progress over all canonical controls.

    Returns absolute counts plus coverage percentages (one decimal place)
    relative to the total number of controls.
    """
    row = self.db.execute(text("""
        SELECT
            count(*) AS total,
            count(*) FILTER (WHERE obligation_ids IS NOT NULL AND obligation_ids != '[]') AS has_obligation,
            count(*) FILTER (WHERE pattern_id IS NOT NULL AND pattern_id != '') AS has_pattern,
            count(*) FILTER (
                WHERE obligation_ids IS NOT NULL AND obligation_ids != '[]'
                  AND pattern_id IS NOT NULL AND pattern_id != ''
            ) AS fully_linked,
            count(*) FILTER (WHERE release_state = 'deprecated') AS deprecated
        FROM canonical_controls
    """)).fetchone()

    total = row[0]
    with_obligation = row[1]
    with_pattern = row[2]
    fully_linked = row[3]
    deprecated = row[4]

    # Guard against division by zero on an empty table.
    denominator = max(total, 1)

    return {
        "total_controls": total,
        "has_obligation": with_obligation,
        "has_pattern": with_pattern,
        "fully_linked": fully_linked,
        "deprecated": deprecated,
        "coverage_obligation_pct": round(with_obligation / denominator * 100, 1),
        "coverage_pattern_pct": round(with_pattern / denominator * 100, 1),
        "coverage_full_pct": round(fully_linked / denominator * 100, 1),
    }
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _extract_regulation_article(
    citation: Optional[str], metadata: Optional[str]
) -> tuple[Optional[str], Optional[str]]:
    """Derive (regulation_code, article) from a control's citation/metadata.

    Both inputs may be JSON strings or already-parsed dicts. The citation is
    consulted first ("article" / "source_article" and "source"); metadata
    ("source_regulation" / "source_article") only fills what is still
    missing. Unparseable input is ignored.

    Returns:
        (regulation_code, article); either element may be None.
    """
    from compliance.services.obligation_extractor import _normalize_regulation

    reg_code = None
    article = None

    # Citation takes precedence.
    if citation:
        try:
            parsed = citation if not isinstance(citation, str) else json.loads(citation)
            if isinstance(parsed, dict):
                article = parsed.get("article") or parsed.get("source_article")
                # The regulation comes from the citation's source field.
                source_name = parsed.get("source", "")
                if source_name:
                    reg_code = _normalize_regulation(source_name)
        except (json.JSONDecodeError, TypeError):
            pass

    # Metadata is the fallback when the citation gave no regulation.
    if metadata and not reg_code:
        try:
            parsed_meta = metadata if not isinstance(metadata, str) else json.loads(metadata)
            if isinstance(parsed_meta, dict):
                source_regulation = parsed_meta.get("source_regulation", "")
                if source_regulation:
                    reg_code = _normalize_regulation(source_regulation)
                if not article:
                    article = parsed_meta.get("source_article")
        except (json.JSONDecodeError, TypeError):
            pass

    return reg_code, article
@@ -33,6 +33,7 @@ class RAGSearchResult:
paragraph: str
source_url: str
score: float
collection: str = ""
class ComplianceRAGClient:
@@ -91,6 +92,7 @@ class ComplianceRAGClient:
paragraph=r.get("paragraph", ""),
source_url=r.get("source_url", ""),
score=r.get("score", 0.0),
collection=collection,
))
return results
@@ -98,6 +100,88 @@ class ComplianceRAGClient:
logger.warning("RAG search failed: %s", e)
return []
async def search_with_rerank(
    self,
    query: str,
    collection: str = "bp_compliance_ce",
    regulations: Optional[List[str]] = None,
    top_k: int = 5,
) -> List[RAGSearchResult]:
    """
    Search with optional cross-encoder re-ranking.

    Fetches max(top_k*4, 20) candidates from RAG, re-ranks them with the
    cross-encoder and returns the best top_k. Falls back to a regular
    search if the reranker is disabled, and to the first top_k unranked
    candidates if re-ranking fails.
    """
    import asyncio

    from .reranker import get_reranker

    reranker = get_reranker()
    if reranker is None:
        return await self.search(query, collection, regulations, top_k)

    # Fetch more candidates for re-ranking
    candidates = await self.search(
        query, collection, regulations, top_k=max(top_k * 4, 20)
    )
    if not candidates:
        return []

    texts = [c.text for c in candidates]
    try:
        # rerank() is synchronous (model inference, plus the model load on
        # first use). Run it in a worker thread so it does not block the
        # event loop for every other request.
        ranked_indices = await asyncio.to_thread(
            reranker.rerank, query, texts, top_k=top_k
        )
        return [candidates[i] for i in ranked_indices]
    except Exception as e:
        logger.warning("Reranking failed, returning unranked: %s", e)
        return candidates[:top_k]
async def scroll(
    self,
    collection: str,
    offset: Optional[str] = None,
    limit: int = 100,
) -> tuple[List[RAGSearchResult], Optional[str]]:
    """
    Page through the chunks of a collection.

    Returns (chunks, next_offset); next_offset is None when there are no
    more pages. A non-200 response or any exception is logged and yields
    ([], None). Scrolled chunks carry a score of 0.0.
    """
    # The scroll endpoint URL is derived from the search URL.
    scroll_url = self._search_url.replace("/search", "/scroll")
    query_params = {"collection": collection, "limit": str(limit)}
    if offset:
        query_params["offset"] = offset

    try:
        async with httpx.AsyncClient(timeout=30.0) as client:
            resp = await client.get(scroll_url, params=query_params)

            if resp.status_code != 200:
                logger.warning(
                    "RAG scroll returned %d: %s", resp.status_code, resp.text[:200]
                )
                return [], None

            payload = resp.json()
            chunks = [
                RAGSearchResult(
                    text=item.get("text", ""),
                    regulation_code=item.get("regulation_code", ""),
                    regulation_name=item.get("regulation_name", ""),
                    regulation_short=item.get("regulation_short", ""),
                    category=item.get("category", ""),
                    article=item.get("article", ""),
                    paragraph=item.get("paragraph", ""),
                    source_url=item.get("source_url", ""),
                    score=0.0,
                    collection=collection,
                )
                for item in payload.get("chunks", [])
            ]
            # Normalize an empty/falsy offset to None ("no more pages").
            return chunks, payload.get("next_offset") or None

    except Exception as e:
        logger.warning("RAG scroll failed: %s", e)
        return [], None
def format_for_prompt(
self, results: List[RAGSearchResult], max_results: int = 5
) -> str:
@@ -0,0 +1,85 @@
"""
Cross-Encoder Re-Ranking for RAG Search Results.
Uses BGE Reranker v2 (BAAI/bge-reranker-v2-m3, MIT license) to re-rank
search results from Qdrant for improved retrieval quality.
Lazy-loads the model on first use. Disabled by default (RERANK_ENABLED=false).
"""
import logging
import os
from typing import Optional
logger = logging.getLogger(__name__)

# Re-ranking is opt-in: only the value "true" (case-insensitive) enables it.
RERANK_ENABLED = os.getenv("RERANK_ENABLED", "false").lower() == "true"
RERANK_MODEL = os.getenv("RERANK_MODEL", "BAAI/bge-reranker-v2-m3")


class Reranker:
    """Cross-encoder reranker using sentence-transformers."""

    def __init__(self, model_name: str = RERANK_MODEL):
        self._model = None  # Lazy init: loaded by _ensure_model() on first rerank
        self._model_name = model_name

    def _ensure_model(self) -> None:
        """Load the cross-encoder on first use.

        Raises:
            ImportError: If sentence-transformers is not installed.
            Exception: Any error raised while constructing the model; it is
                logged and re-raised unchanged.
        """
        if self._model is not None:
            return
        try:
            # Imported lazily so the module can be imported without the
            # optional dependency installed.
            from sentence_transformers import CrossEncoder

            logger.info("Loading reranker model: %s", self._model_name)
            self._model = CrossEncoder(self._model_name)
            logger.info("Reranker model loaded successfully")
        except ImportError:
            logger.error(
                "sentence-transformers not installed. "
                "Install with: pip install sentence-transformers"
            )
            raise
        except Exception as e:
            logger.error("Failed to load reranker model: %s", e)
            raise

    def rerank(
        self, query: str, texts: list[str], top_k: int = 5
    ) -> list[int]:
        """
        Return indices of top_k texts sorted by relevance (highest first).

        Args:
            query: The search query.
            texts: List of candidate texts to re-rank.
            top_k: Number of top results to return. Values <= 0 yield an
                empty list.

        Returns:
            List of indices into the original texts list, sorted by relevance.

        Raises:
            Exception: Whatever _ensure_model() or the model's predict() raise.
        """
        # Guard before touching the model: nothing to rank, or nothing
        # requested. Without the top_k check a negative value would slice
        # from the end (ranked[:-1]) and return the wrong number of items.
        if not texts or top_k <= 0:
            return []

        self._ensure_model()
        pairs = [[query, text] for text in texts]
        scores = self._model.predict(pairs)

        # Sort by score descending, return indices
        ranked = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)
        return ranked[:top_k]


# Module-level singleton
_reranker: Optional[Reranker] = None


def get_reranker() -> Optional[Reranker]:
    """Get the shared reranker instance. Returns None if disabled."""
    global _reranker
    if not RERANK_ENABLED:
        return None
    if _reranker is None:
        # Constructing the wrapper does not load the model (see _ensure_model).
        _reranker = Reranker()
    return _reranker
@@ -0,0 +1,331 @@
"""V1 Control Enrichment Service — Match Eigenentwicklung controls to regulations.
Finds regulatory coverage for v1 controls (generation_strategy='ungrouped',
pipeline_version=1, no source_citation) by embedding similarity search.
Reuses embedding + Qdrant helpers from control_dedup.py.
"""
import logging
from typing import Optional
from sqlalchemy import text
from database import SessionLocal
from compliance.services.control_dedup import (
get_embedding,
qdrant_search_cross_regulation,
)
logger = logging.getLogger(__name__)
# Similarity threshold — lower than dedup (0.85) since we want informational matches
# Typical top scores for v1 controls are 0.70-0.77
# Qdrant hits scoring below this value are skipped in enrich_v1_matches.
V1_MATCH_THRESHOLD = 0.70
# Upper bound on the match rank stored per v1 control in enrich_v1_matches.
V1_MAX_MATCHES = 5
def _is_eigenentwicklung_query() -> str:
"""SQL WHERE clause identifying v1 Eigenentwicklung controls."""
return """
generation_strategy = 'ungrouped'
AND (pipeline_version = '1' OR pipeline_version IS NULL)
AND source_citation IS NULL
AND parent_control_uuid IS NULL
AND release_state NOT IN ('rejected', 'merged', 'deprecated')
"""
async def count_v1_controls() -> int:
    """Return the number of canonical controls matching the v1 Eigenentwicklung filter."""
    where_clause = _is_eigenentwicklung_query()
    statement = text(f"""
        SELECT COUNT(*) AS cnt
        FROM canonical_controls
        WHERE {where_clause}
    """)
    with SessionLocal() as session:
        count_row = session.execute(statement).fetchone()
        if not count_row:
            return 0
        return count_row.cnt
def _resolve_regulatory_control(db, matched_uuid: str):
    """Return the row with a source_citation for a matched control (itself or its parent), else None."""
    matched_row = db.execute(text("""
        SELECT c.id, c.control_id, c.title, c.source_citation,
               c.severity, c.category, c.parent_control_uuid
        FROM canonical_controls c
        WHERE c.id = CAST(:uuid AS uuid)
    """), {"uuid": matched_uuid}).fetchone()
    if not matched_row:
        return None

    reg_row = matched_row
    if not reg_row.source_citation and reg_row.parent_control_uuid:
        # Atomic controls usually point at a parent that carries the citation.
        parent_row = db.execute(text("""
            SELECT id, control_id, title, source_citation,
                   severity, category, parent_control_uuid
            FROM canonical_controls
            WHERE id = CAST(:uuid AS uuid)
              AND source_citation IS NOT NULL
        """), {"uuid": str(reg_row.parent_control_uuid)}).fetchone()
        if parent_row:
            reg_row = parent_row

    # An empty citation (e.g. {}) counts as "no regulatory source".
    return reg_row if reg_row.source_citation else None


def _store_v1_matches(db, v1, hits) -> list[dict]:
    """Upsert up to V1_MAX_MATCHES regulatory matches for one v1 control and return their summaries."""
    stored: list[dict] = []
    seen_parents: set[str] = set()

    for hit in hits:
        if len(stored) >= V1_MAX_MATCHES:
            break

        score = hit.get("score", 0)
        if score < V1_MATCH_THRESHOLD:
            continue

        payload = hit.get("payload") or {}
        matched_uuid = payload.get("control_uuid")
        # Skip hits without a control reference and the v1 control itself.
        if not matched_uuid or matched_uuid == str(v1.id):
            continue

        reg_row = _resolve_regulatory_control(db, matched_uuid)
        if reg_row is None:
            continue

        # Deduplicate by resolved control so one regulation is listed once.
        parent_key = str(reg_row.id)
        if parent_key in seen_parents:
            continue
        seen_parents.add(parent_key)
        rank = len(stored) + 1

        source_citation = reg_row.source_citation or {}
        is_mapping = isinstance(source_citation, dict)
        matched_source = source_citation.get("source") if is_mapping else None
        matched_article = source_citation.get("article") if is_mapping else None

        # Link to the regulatory control (not the atomic child).
        db.execute(text("""
            INSERT INTO v1_control_matches
                (v1_control_uuid, matched_control_uuid, similarity_score,
                 match_rank, matched_source, matched_article, match_method)
            VALUES
                (CAST(:v1_uuid AS uuid), CAST(:matched_uuid AS uuid), :score,
                 :rank, :source, :article, 'embedding')
            ON CONFLICT (v1_control_uuid, matched_control_uuid) DO UPDATE
            SET similarity_score = EXCLUDED.similarity_score,
                match_rank = EXCLUDED.match_rank
        """), {
            "v1_uuid": str(v1.id),
            "matched_uuid": str(reg_row.id),
            "score": round(score, 3),
            "rank": rank,
            "source": matched_source,
            "article": matched_article,
        })

        stored.append({
            "v1_control_id": v1.control_id,
            "v1_title": v1.title,
            "matched_control_id": reg_row.control_id,
            "matched_title": reg_row.title,
            "matched_source": matched_source,
            "matched_article": matched_article,
            "similarity_score": round(score, 3),
            "match_rank": rank,
        })

    return stored


async def enrich_v1_matches(
    dry_run: bool = True,
    batch_size: int = 100,
    offset: int = 0,
) -> dict:
    """Find regulatory matches for v1 Eigenentwicklung controls.

    For every v1 control in the requested page, the title/objective text is
    embedded, similar controls are looked up in the ``atomic_controls_dedup``
    Qdrant collection, and each hit at or above V1_MATCH_THRESHOLD is resolved
    to a control carrying a source_citation and upserted into
    v1_control_matches (at most V1_MAX_MATCHES per v1 control).

    Args:
        dry_run: If True, nothing is written; only the total count and a
            sample of the controls in this page are returned.
        batch_size: Number of v1 controls to process per call.
        offset: Pagination offset (v1 control index).

    Returns:
        Stats dict with counts, sample matches, and pagination info.
        ``next_offset`` is None once a page comes back shorter than
        ``batch_size``. Per-control failures are collected under ``errors``
        (first 10 only) instead of being raised.
    """
    total_v1 = await count_v1_controls()

    with SessionLocal() as db:
        # 1. Load v1 controls (paginated)
        v1_controls = db.execute(text(f"""
            SELECT id, control_id, title, objective, category
            FROM canonical_controls
            WHERE {_is_eigenentwicklung_query()}
            ORDER BY control_id
            LIMIT :limit OFFSET :offset
        """), {"limit": batch_size, "offset": offset}).fetchall()

        if not v1_controls:
            return {
                "dry_run": dry_run,
                "processed": 0,
                "total_v1": total_v1,
                "message": "Kein weiterer Batch — alle v1 Controls verarbeitet.",
            }

        if dry_run:
            return {
                "dry_run": True,
                "total_v1": total_v1,
                "offset": offset,
                "batch_size": batch_size,
                "sample_controls": [
                    {
                        "control_id": r.control_id,
                        "title": r.title,
                        "category": r.category,
                    }
                    for r in v1_controls[:20]
                ],
            }

        # 2. Process each v1 control
        processed = 0
        matches_inserted = 0
        errors = []
        sample_matches = []

        for v1 in v1_controls:
            try:
                # Join only the parts that are present so a NULL objective
                # does not end up as the literal text "None" in the query.
                search_text = " — ".join(
                    part for part in (v1.title, v1.objective) if part
                )

                embedding = await get_embedding(search_text)
                if not embedding:
                    errors.append({
                        "control_id": v1.control_id,
                        "error": "Embedding fehlgeschlagen",
                    })
                    continue

                # Search Qdrant (cross-regulation, no pattern filter)
                results = await qdrant_search_cross_regulation(
                    embedding, top_k=20,
                    collection="atomic_controls_dedup",
                )

                # SAVEPOINT per control: if a statement fails, only this
                # control's writes are rolled back and the outer transaction
                # stays usable for the remaining controls and the commit.
                with db.begin_nested():
                    stored = _store_v1_matches(db, v1, results)

                matches_inserted += len(stored)
                # Keep at most 20 samples overall.
                sample_matches.extend(stored[: 20 - len(sample_matches)])
                processed += 1
            except Exception as e:
                logger.warning("V1 enrichment error for %s: %s", v1.control_id, e)
                errors.append({
                    "control_id": v1.control_id,
                    "error": str(e),
                })

        db.commit()

    # Pagination: a full page means there may be more controls to process.
    next_offset = offset + batch_size if len(v1_controls) == batch_size else None

    return {
        "dry_run": False,
        "offset": offset,
        "batch_size": batch_size,
        "next_offset": next_offset,
        "total_v1": total_v1,
        "processed": processed,
        "matches_inserted": matches_inserted,
        "errors": errors[:10],
        "sample_matches": sample_matches,
    }
async def get_v1_matches(control_uuid: str) -> list[dict]:
    """Return the stored regulatory matches of one v1 control, ordered by rank.

    Args:
        control_uuid: The UUID of the v1 control.

    Returns:
        One dict per row of v1_control_matches joined with the matched
        canonical control; similarity_score is converted to float.
    """
    statement = text("""
        SELECT
            m.similarity_score,
            m.match_rank,
            m.matched_source,
            m.matched_article,
            m.match_method,
            c.control_id AS matched_control_id,
            c.title AS matched_title,
            c.objective AS matched_objective,
            c.severity AS matched_severity,
            c.category AS matched_category,
            c.source_citation AS matched_source_citation
        FROM v1_control_matches m
        JOIN canonical_controls c ON c.id = m.matched_control_uuid
        WHERE m.v1_control_uuid = CAST(:uuid AS uuid)
        ORDER BY m.match_rank
    """)

    def _as_match(row) -> dict:
        # Key order mirrors the response shape callers already receive.
        return {
            "matched_control_id": row.matched_control_id,
            "matched_title": row.matched_title,
            "matched_objective": row.matched_objective,
            "matched_severity": row.matched_severity,
            "matched_category": row.matched_category,
            "matched_source": row.matched_source,
            "matched_article": row.matched_article,
            "matched_source_citation": row.matched_source_citation,
            "similarity_score": float(row.similarity_score),
            "match_rank": row.match_rank,
            "match_method": row.match_method,
        }

    with SessionLocal() as session:
        match_rows = session.execute(statement, {"uuid": control_uuid}).fetchall()
        return [_as_match(row) for row in match_rows]
async def get_v1_enrichment_stats() -> dict:
    """Get overview stats for v1 enrichment.

    Returns:
        Dict with the number of v1 controls, how many of them have / lack at
        least one stored match, the total number of match rows, and the
        average similarity score (None when there are no match rows).
    """
    v1_filter = _is_eigenentwicklung_query()

    with SessionLocal() as db:
        total_row = db.execute(text(f"""
            SELECT COUNT(*) AS cnt FROM canonical_controls
            WHERE {v1_filter}
        """)).fetchone()

        # The shared filter uses unqualified column names. Applying it in a
        # subquery on canonical_controls alone avoids rewriting the clause
        # with a table alias via string replacement.
        matched_row = db.execute(text(f"""
            SELECT COUNT(DISTINCT m.v1_control_uuid) AS cnt
            FROM v1_control_matches m
            WHERE m.v1_control_uuid IN (
                SELECT cc.id FROM canonical_controls cc
                WHERE {v1_filter}
            )
        """)).fetchone()

        # Both aggregates scan the same table, so fetch them together.
        agg_row = db.execute(text("""
            SELECT COUNT(*) AS cnt, AVG(similarity_score) AS avg_score
            FROM v1_control_matches
        """)).fetchone()

    total_v1 = total_row.cnt if total_row else 0
    with_matches = matched_row.cnt if matched_row else 0
    avg_value = agg_row.avg_score if agg_row else None

    return {
        "total_v1_controls": total_v1,
        "v1_with_matches": with_matches,
        "v1_without_matches": total_v1 - with_matches,
        "total_matches": agg_row.cnt if agg_row else 0,
        # AVG is NULL on an empty table; compare against None so a genuine
        # average of 0 is not reported as "no data".
        "avg_similarity_score": round(float(avg_value), 3) if avg_value is not None else None,
    }
+18
View File
@@ -14,6 +14,12 @@ from contextlib import asynccontextmanager
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
# Configure root logging so all modules' logger.info() etc. are visible.
# Level is INFO; records are rendered as "LEVEL:logger.name: message".
logging.basicConfig(
level=logging.INFO,
format="%(levelname)s:%(name)s: %(message)s",
)
# Logger for this module, named after it.
logger = logging.getLogger(__name__)
# Compliance-specific API routers
@@ -86,6 +92,18 @@ async def health():
}
@app.get("/debug/routers", tags=["system"])
async def debug_routers():
    """Diagnostic endpoint: report how many sub-routers loaded and which imports failed.

    Returns a dict with the number of configured router modules, the loaded
    count, and the failed modules mapped to their recorded error text.
    """
    # Imported inside the handler rather than at module import time.
    from compliance.api import _ROUTER_MODULES, _failed_routers, _loaded_count

    report = {"total": len(_ROUTER_MODULES), "loaded": _loaded_count}
    report["failed_count"] = len(_failed_routers)
    report["failed"] = _failed_routers
    return report
# --- Compliance-specific Routers ---
# Consent (user-facing)
+11 -5
View File
@@ -79,11 +79,14 @@ def run_migrations():
logger.info("%d pending migrations (of %d total)", len(pending), len(migration_files))
failed = []
for migration_file in pending:
logger.info("Applying migration: %s", migration_file.name)
try:
sql = migration_file.read_text(encoding="utf-8")
# Execute the full SQL file as-is (supports BEGIN/COMMIT)
# Strip explicit BEGIN/COMMIT — we manage transactions ourselves
sql = re.sub(r'(?mi)^\s*BEGIN\s*;\s*$', '', sql)
sql = re.sub(r'(?mi)^\s*COMMIT\s*;\s*$', '', sql)
cursor.execute(sql)
raw_conn.commit()
# Record successful application
@@ -96,11 +99,14 @@ def run_migrations():
except Exception as e:
raw_conn.rollback()
logger.error(" FAILED: %s%s", migration_file.name, e)
raise RuntimeError(
f"Migration {migration_file.name} failed: {e}"
) from e
failed.append((migration_file.name, str(e)))
# Continue with remaining migrations instead of aborting
logger.info("All migrations applied successfully")
if failed:
names = ", ".join(f[0] for f in failed)
logger.error("Some migrations failed: %s", names)
else:
logger.info("All migrations applied successfully")
finally:
raw_conn.close()
@@ -2,7 +2,7 @@
-- Adds job tracking, chunk tracking, blocked sources, and extends canonical_controls
-- for the 3-license-rule system (free_use, citation_required, restricted).
BEGIN;
-- Transaction managed by migration_runner
-- =============================================================================
-- 1. Job-Tracking for Generator Runs
@@ -69,35 +69,21 @@ CREATE TABLE IF NOT EXISTS canonical_blocked_sources (
-- =============================================================================
-- 4. Extend canonical_controls: release_state + 3-rule columns
-- Safe: only runs if canonical_controls exists
-- =============================================================================
-- Expand release_state enum to include generator states
ALTER TABLE canonical_controls DROP CONSTRAINT IF EXISTS canonical_controls_release_state_check;
ALTER TABLE canonical_controls ADD CONSTRAINT canonical_controls_release_state_check
CHECK (release_state IN ('draft', 'review', 'approved', 'deprecated', 'needs_review', 'too_close', 'duplicate'));
-- License rule: 1 = free_use, 2 = citation_required, 3 = restricted
ALTER TABLE canonical_controls ADD COLUMN IF NOT EXISTS
license_rule INTEGER DEFAULT NULL;
-- Original text from source (Rule 1+2 only; Rule 3 = always NULL)
ALTER TABLE canonical_controls ADD COLUMN IF NOT EXISTS
source_original_text TEXT DEFAULT NULL;
-- Citation info (Rule 1+2 only; Rule 3 = always NULL)
ALTER TABLE canonical_controls ADD COLUMN IF NOT EXISTS
source_citation JSONB DEFAULT NULL;
-- Whether source info may be shown to customers
ALTER TABLE canonical_controls ADD COLUMN IF NOT EXISTS
customer_visible BOOLEAN DEFAULT true;
-- Generation metadata (internal only, never shown to customers)
ALTER TABLE canonical_controls ADD COLUMN IF NOT EXISTS
generation_metadata JSONB DEFAULT NULL;
-- Index for filtering by license rule and customer visibility
CREATE INDEX IF NOT EXISTS idx_canonical_controls_license_rule ON canonical_controls(license_rule);
CREATE INDEX IF NOT EXISTS idx_canonical_controls_customer_visible ON canonical_controls(customer_visible);
COMMIT;
DO $$
BEGIN
IF EXISTS (SELECT 1 FROM information_schema.tables WHERE table_name = 'canonical_controls') THEN
ALTER TABLE canonical_controls DROP CONSTRAINT IF EXISTS canonical_controls_release_state_check;
ALTER TABLE canonical_controls ADD CONSTRAINT canonical_controls_release_state_check
CHECK (release_state IN ('draft', 'review', 'approved', 'deprecated', 'needs_review', 'too_close', 'duplicate'));
ALTER TABLE canonical_controls ADD COLUMN IF NOT EXISTS license_rule INTEGER DEFAULT NULL;
ALTER TABLE canonical_controls ADD COLUMN IF NOT EXISTS source_original_text TEXT DEFAULT NULL;
ALTER TABLE canonical_controls ADD COLUMN IF NOT EXISTS source_citation JSONB DEFAULT NULL;
ALTER TABLE canonical_controls ADD COLUMN IF NOT EXISTS customer_visible BOOLEAN DEFAULT true;
ALTER TABLE canonical_controls ADD COLUMN IF NOT EXISTS generation_metadata JSONB DEFAULT NULL;
CREATE INDEX IF NOT EXISTS idx_canonical_controls_license_rule ON canonical_controls(license_rule);
CREATE INDEX IF NOT EXISTS idx_canonical_controls_customer_visible ON canonical_controls(customer_visible);
END IF;
END $$;
@@ -0,0 +1,44 @@
-- Migration 047: Add verification_method and category to canonical_controls
-- verification_method: How a control is verified (code_review, document, tool, hybrid)
-- category: Thematic grouping for customer-facing filters
-- Safe: only alters canonical_controls if it exists
-- Adds verification_method and category (plus their indexes) to
-- canonical_controls. Wrapped in a DO block so the whole step is skipped
-- when the table is absent.
-- NOTE(review): the existence check has no table_schema filter, so a table
-- of this name in any schema satisfies it — confirm this is intended.
DO $$
BEGIN
IF EXISTS (SELECT 1 FROM information_schema.tables WHERE table_name = 'canonical_controls') THEN
-- verification_method: NULL or one of the four allowed values.
ALTER TABLE canonical_controls ADD COLUMN IF NOT EXISTS
verification_method VARCHAR(20) DEFAULT NULL
CHECK (verification_method IN ('code_review', 'document', 'tool', 'hybrid'));
-- category: free-form key; no foreign key is declared here.
ALTER TABLE canonical_controls ADD COLUMN IF NOT EXISTS
category VARCHAR(50) DEFAULT NULL;
CREATE INDEX IF NOT EXISTS idx_cc_verification ON canonical_controls(verification_method);
CREATE INDEX IF NOT EXISTS idx_cc_category ON canonical_controls(category);
END IF;
END $$;
-- Lookup table of control categories with German and English labels.
-- sort_order is a display-ordering hint and defaults to 0.
CREATE TABLE IF NOT EXISTS canonical_control_categories (
category_id VARCHAR(50) PRIMARY KEY,
label_de VARCHAR(100) NOT NULL,
label_en VARCHAR(100) NOT NULL,
sort_order INTEGER DEFAULT 0
);
-- Seed the category lookup table. Columns are listed explicitly so the
-- statement does not depend on the table's physical column order.
-- Re-running is safe: rows whose category_id already exists are skipped.
INSERT INTO canonical_control_categories (category_id, label_de, label_en, sort_order) VALUES
    ('encryption', 'Verschluesselung & Kryptographie', 'Encryption & Cryptography', 1),
    ('authentication', 'Authentisierung & Zugriffskontrolle', 'Authentication & Access Control', 2),
    ('network', 'Netzwerksicherheit', 'Network Security', 3),
    ('data_protection', 'Datenschutz & Datensicherheit', 'Data Protection & Security', 4),
    ('logging', 'Logging & Monitoring', 'Logging & Monitoring', 5),
    ('incident', 'Vorfallmanagement', 'Incident Management', 6),
    ('continuity', 'Notfall & Wiederherstellung', 'Continuity & Recovery', 7),
    ('compliance', 'Compliance & Audit', 'Compliance & Audit', 8),
    ('supply_chain', 'Lieferkettenmanagement', 'Supply Chain Management', 9),
    ('physical', 'Physische Sicherheit', 'Physical Security', 10),
    ('personnel', 'Personal & Schulung', 'Personnel & Training', 11),
    ('application', 'Anwendungssicherheit', 'Application Security', 12),
    ('system', 'Systemhaertung & -betrieb', 'System Hardening & Operations', 13),
    ('risk', 'Risikomanagement', 'Risk Management', 14),
    ('governance', 'Sicherheitsorganisation', 'Security Governance', 15),
    ('hardware', 'Hardware & Plattformsicherheit', 'Hardware & Platform Security', 16),
    ('identity', 'Identitaetsmanagement', 'Identity Management', 17)
ON CONFLICT DO NOTHING;
@@ -0,0 +1,22 @@
-- 048: Expand processing_path CHECK constraint for new pipeline paths
-- New values: prefilter_skip, no_control, store_failed, error
-- Safe: only runs if the table exists (may not exist on all environments)
-- Replaces the processing_path CHECK constraint on canonical_processed_chunks
-- with one that lists seven allowed values. Skipped entirely when the table
-- is absent.
DO $$
BEGIN
IF EXISTS (SELECT 1 FROM information_schema.tables WHERE table_name = 'canonical_processed_chunks') THEN
-- Drop first so the ADD below does not collide with an existing constraint of the same name.
ALTER TABLE canonical_processed_chunks
DROP CONSTRAINT IF EXISTS canonical_processed_chunks_processing_path_check;
ALTER TABLE canonical_processed_chunks
ADD CONSTRAINT canonical_processed_chunks_processing_path_check
CHECK (processing_path IN (
'structured',
'llm_reform',
'skipped',
'prefilter_skip',
'no_control',
'store_failed',
'error'
));
END IF;
END $$;
@@ -0,0 +1,13 @@
-- 049: Add target_audience field to canonical_controls
-- Distinguishes who a control is relevant for: enterprises, authorities, providers, or all.
-- Safe: only runs if the table exists (may not exist on all environments)
-- Adds the nullable target_audience column (restricted to four values) and
-- its index to canonical_controls. Skipped when the table is absent.
DO $$
BEGIN
IF EXISTS (SELECT 1 FROM information_schema.tables WHERE table_name = 'canonical_controls') THEN
ALTER TABLE canonical_controls ADD COLUMN IF NOT EXISTS
target_audience VARCHAR(20) DEFAULT NULL
CHECK (target_audience IN ('enterprise', 'authority', 'provider', 'all'));
CREATE INDEX IF NOT EXISTS idx_cc_target_audience ON canonical_controls(target_audience);
END IF;
END $$;
@@ -0,0 +1,22 @@
-- Score Snapshots: Historical compliance score tracking
-- Migration 050
-- One row per tenant / project / snapshot_date with the score and the
-- control, evidence and risk counters recorded for that date.
-- NOTE(review): project_id is nullable and part of the UNIQUE constraint;
-- PostgreSQL treats NULLs as distinct there by default, so several rows with
-- project_id NULL for the same tenant and date would be accepted — confirm
-- whether that is intended.
CREATE TABLE IF NOT EXISTS compliance_score_snapshots (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
tenant_id UUID NOT NULL,
project_id UUID,
score DECIMAL(5,2) NOT NULL,
controls_total INTEGER DEFAULT 0,
controls_pass INTEGER DEFAULT 0,
controls_partial INTEGER DEFAULT 0,
evidence_total INTEGER DEFAULT 0,
evidence_valid INTEGER DEFAULT 0,
risks_total INTEGER DEFAULT 0,
risks_high INTEGER DEFAULT 0,
snapshot_date DATE NOT NULL,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
UNIQUE (tenant_id, project_id, snapshot_date)
);
-- Lookup indexes by tenant and by date.
CREATE INDEX IF NOT EXISTS idx_score_snap_tenant ON compliance_score_snapshots(tenant_id);
CREATE INDEX IF NOT EXISTS idx_score_snap_date ON compliance_score_snapshots(snapshot_date);
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,53 @@
-- Process Manager: Recurring compliance tasks with audit trail
-- Migration 052
-- Recurring compliance tasks. All objects are created with IF NOT EXISTS so
-- the migration can be re-run safely, matching the other migrations.
CREATE TABLE IF NOT EXISTS compliance_process_tasks (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    tenant_id UUID NOT NULL,
    project_id UUID,
    task_code VARCHAR(50) NOT NULL,
    title VARCHAR(500) NOT NULL,
    description TEXT,
    category VARCHAR(50) NOT NULL
        CHECK (category IN ('dsgvo','nis2','bsi','iso27001','ai_act','internal')),
    priority VARCHAR(20) NOT NULL DEFAULT 'medium'
        CHECK (priority IN ('critical','high','medium','low')),
    frequency VARCHAR(20) NOT NULL DEFAULT 'yearly'
        CHECK (frequency IN ('weekly','monthly','quarterly','semi_annual','yearly','once')),
    assigned_to VARCHAR(255),
    responsible_team VARCHAR(255),
    linked_control_ids JSONB DEFAULT '[]',
    linked_module VARCHAR(100),
    last_completed_at TIMESTAMPTZ,
    next_due_date DATE,
    due_reminder_days INTEGER DEFAULT 14,
    status VARCHAR(20) NOT NULL DEFAULT 'pending'
        CHECK (status IN ('pending','in_progress','completed','overdue','skipped')),
    completion_date TIMESTAMPTZ,
    completion_result TEXT,
    completion_evidence_id UUID,
    follow_up_actions JSONB DEFAULT '[]',
    is_seed BOOLEAN DEFAULT FALSE,
    notes TEXT,
    tags JSONB DEFAULT '[]',
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    -- NOTE(review): project_id is nullable; NULLs are distinct in a UNIQUE
    -- constraint by default, so task_code is not unique per tenant for rows
    -- without a project — confirm whether that is intended.
    UNIQUE (tenant_id, project_id, task_code)
);

-- Completion history; rows are removed together with their task.
CREATE TABLE IF NOT EXISTS compliance_process_task_history (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    task_id UUID NOT NULL REFERENCES compliance_process_tasks(id) ON DELETE CASCADE,
    completed_by VARCHAR(255),
    completed_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    result TEXT,
    evidence_id UUID,
    notes TEXT,
    status VARCHAR(20) NOT NULL
);

CREATE INDEX IF NOT EXISTS idx_process_tasks_tenant ON compliance_process_tasks(tenant_id);
CREATE INDEX IF NOT EXISTS idx_process_tasks_status ON compliance_process_tasks(status);
CREATE INDEX IF NOT EXISTS idx_process_tasks_due ON compliance_process_tasks(next_due_date);
CREATE INDEX IF NOT EXISTS idx_process_tasks_category ON compliance_process_tasks(category);
CREATE INDEX IF NOT EXISTS idx_task_history_task ON compliance_process_task_history(task_id);
@@ -0,0 +1,62 @@
-- Evidence Checks: Automated compliance verification
-- Migration 053
-- Definitions of automated evidence checks. All objects are created with
-- IF NOT EXISTS so the migration can be re-run safely, matching the other
-- migrations.
CREATE TABLE IF NOT EXISTS compliance_evidence_checks (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    tenant_id UUID NOT NULL,
    project_id UUID,
    check_code VARCHAR(50) NOT NULL,
    title VARCHAR(500) NOT NULL,
    description TEXT,
    check_type VARCHAR(30) NOT NULL
        CHECK (check_type IN ('tls_scan','header_check','certificate_check',
                              'config_scan','api_scan','dns_check','port_scan')),
    target_url TEXT,
    target_config JSONB DEFAULT '{}',
    linked_control_ids JSONB DEFAULT '[]',
    frequency VARCHAR(20) DEFAULT 'monthly'
        CHECK (frequency IN ('daily','weekly','monthly','quarterly','manual')),
    last_run_at TIMESTAMPTZ,
    next_run_at TIMESTAMPTZ,
    is_active BOOLEAN DEFAULT TRUE,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    UNIQUE (tenant_id, project_id, check_code)
);

-- One row per execution of a check; removed together with the check.
CREATE TABLE IF NOT EXISTS compliance_evidence_check_results (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    check_id UUID NOT NULL REFERENCES compliance_evidence_checks(id) ON DELETE CASCADE,
    tenant_id UUID NOT NULL,
    run_status VARCHAR(20) NOT NULL DEFAULT 'running'
        CHECK (run_status IN ('running','passed','failed','warning','error')),
    result_data JSONB NOT NULL DEFAULT '{}',
    summary TEXT,
    findings_count INTEGER DEFAULT 0,
    critical_findings INTEGER DEFAULT 0,
    evidence_id UUID,
    duration_ms INTEGER,
    run_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);

-- Mapping between an evidence item and a control code, unique per tenant.
CREATE TABLE IF NOT EXISTS compliance_evidence_control_map (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    tenant_id UUID NOT NULL,
    evidence_id UUID NOT NULL,
    control_code VARCHAR(50) NOT NULL,
    mapping_type VARCHAR(20) DEFAULT 'supports'
        CHECK (mapping_type IN ('supports','partially_supports','required')),
    verified_at TIMESTAMPTZ,
    verified_by VARCHAR(255),
    notes TEXT,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    UNIQUE (tenant_id, evidence_id, control_code)
);

CREATE INDEX IF NOT EXISTS idx_evidence_checks_tenant ON compliance_evidence_checks(tenant_id);
CREATE INDEX IF NOT EXISTS idx_evidence_checks_type ON compliance_evidence_checks(check_type);
CREATE INDEX IF NOT EXISTS idx_evidence_checks_active ON compliance_evidence_checks(is_active);
CREATE INDEX IF NOT EXISTS idx_check_results_check ON compliance_evidence_check_results(check_id);
CREATE INDEX IF NOT EXISTS idx_check_results_status ON compliance_evidence_check_results(run_status);
CREATE INDEX IF NOT EXISTS idx_evidence_control_map_tenant ON compliance_evidence_control_map(tenant_id);
CREATE INDEX IF NOT EXISTS idx_evidence_control_map_control ON compliance_evidence_control_map(control_code);
@@ -0,0 +1,340 @@
-- Migration 054: Erweiterte HinSchG-Wiki-Artikel
-- Ergaenzt die bestehende HinSchG-Kategorie um detaillierte Artikel
-- Bestehenden Grundlagen-Artikel mit umfassenderem Inhalt aktualisieren
UPDATE compliance_wiki_articles
SET content = '## Ueberblick
Das **Hinweisgeberschutzgesetz (HinSchG)** setzt die EU-Whistleblowing-Richtlinie (EU) 2019/1937 in deutsches Recht um. Es schuetzt Personen, die auf Missstaende in Unternehmen und Behoerden hinweisen und ist seit dem **2. Juli 2023** in Kraft.
- Ab 02.07.2023: Pflicht fuer Unternehmen ab **250 Beschaeftigten**
- Ab 17.12.2023: Pflicht fuer Unternehmen ab **50 Beschaeftigten** (§ 12 HinSchG)
## Kernpflichten
### Interne Meldestelle einrichten (§ 12 HinSchG)
- Kann eine **interne Person** (Ombudsperson) oder ein **externer Dienstleister** sein
- Meldungen muessen **muendlich, schriftlich und persoenlich** moeglich sein
- Die Meldestelle muss **unabhaengig** und **fachkundig** sein
- **Gemeinsame Meldestellen** sind fuer Unternehmen mit 50–249 Beschaeftigten zulaessig
### Gesetzliche Fristen (§ 17 HinSchG)
- Eingangsbestaetigung innerhalb von **7 Tagen** nach Meldungseingang (§ 17 Abs. 1 S. 2)
- Rueckmeldung ueber Folgemaßnahmen innerhalb von **3 Monaten** nach Eingangsbestaetigung (§ 17 Abs. 2)
- Dokumentation muss **3 Jahre** nach Abschluss aufbewahrt werden (§ 11 Abs. 5)
### Vertraulichkeitsgebot (§ 8 HinSchG)
- Die **Identitaet des Hinweisgebers** darf nur den zustaendigen Personen bekannt sein
- Offenlegung nur mit **Einwilligung** oder bei **gesetzlicher Verpflichtung**
- Verstoss ist bussgeld-bewehrt (bis 50.000 EUR)
## Welche Daten fallen an?
- Identitaet des Hinweisgebers (besonders schuetzenswert!)
- Beschuldigte Personen
- Zeugen und weitere Beteiligte
- Inhalt der Meldung (kann sensible Daten enthalten)
- Kommunikationsverlauf
## Datenschutz-Anforderungen
- **Eigene Verarbeitungstaetigkeit** im VVT anlegen
- Rechtsgrundlage: Art. 6 Abs. 1c DSGVO (rechtliche Verpflichtung)
- **Zugriffsbeschraenkung:** Nur die benannte Meldestelle darf auf die Daten zugreifen
- **Loeschfrist:** 3 Jahre nach Abschluss des Verfahrens (§ 11 Abs. 5 HinSchG)
- Bei Art.-9-Daten in Meldungen: besondere Schutzmassnahmen erforderlich
## Sanktionen (§ 40 HinSchG)
| Verstoss | Bussgeld |
|----------|----------|
| Keine Meldestelle eingerichtet | Bis 20.000 EUR |
| Behinderung einer Meldung | Bis 50.000 EUR |
| Verstoss gegen Vertraulichkeitsgebot | Bis 50.000 EUR |
| Repressalien gegen Hinweisgeber | Bis 50.000 EUR |
## Praxis-Tipp
Pruefen Sie bei externen Meldestellen-Anbietern, ob ein **AVV** erforderlich ist. In den meisten Faellen ja – der Anbieter verarbeitet personenbezogene Daten in Ihrem Auftrag.',
summary = 'Das HinSchG setzt die EU-Whistleblowing-Richtlinie um und verpflichtet seit Dezember 2023 alle Unternehmen ab 50 Beschaeftigten zur Einrichtung einer internen Meldestelle. Verstoesse koennen mit bis zu 50.000 EUR geahndet werden.',
legal_refs = ARRAY['§ 2 HinSchG', '§ 8 HinSchG', '§ 11 Abs. 5 HinSchG', '§ 12 HinSchG', '§ 17 HinSchG', '§ 36 HinSchG', '§ 40 HinSchG', 'Art. 6 Abs. 1c DSGVO', 'EU-RL 2019/1937'],
tags = ARRAY['hinweisgeberschutz', 'whistleblower', 'meldestelle', 'vertraulichkeit', 'fristen', 'bussgelder'],
version = 2,
updated_at = NOW()
WHERE id = 'hinschg-grundlagen';
-- Neuer Artikel: Sachlicher Anwendungsbereich
INSERT INTO compliance_wiki_articles (id, category_id, title, summary, content, legal_refs, tags, relevance, source_urls) VALUES
('hinschg-anwendungsbereich', 'hinschg',
'Sachlicher Anwendungsbereich — Welche Verstoesse sind meldbar?',
'Das HinSchG schuetzt Meldungen ueber Verstoesse gegen EU-Recht und nationales Recht. Der Anwendungsbereich geht weit ueber rein strafrechtliche Verstoesse hinaus.',
'## Ueberblick
Der sachliche Anwendungsbereich des HinSchG (§ 2) ist bewusst weit gefasst. Geschuetzt werden Meldungen ueber Verstoesse, die **strafbewehrt** sind oder **bussgeld-bewehrt**, sowie Verstoesse gegen bestimmte **EU-Rechtsakte** und deren nationale Umsetzungsgesetze.
## Meldbare Verstoesse (§ 2 HinSchG)
### Strafvorschriften
- Alle Straftaten nach dem **StGB** (Betrug, Untreue, Korruption, Urkundenfaelschung)
- Straftaten nach **Nebenstrafrecht** (Umweltstrafrecht, Wirtschaftsstrafrecht)
### Bussgeld-bewehrte Vorschriften
- Verstoesse gegen **Ordnungswidrigkeiten-Vorschriften**, soweit die verletzte Norm dem Schutz von Leben, Leib, Gesundheit oder Rechten von Beschaeftigten dient
### EU-Rechtsakte und nationale Umsetzung
| Rechtsgebiet | Beispiele |
|-------------|-----------|
| Datenschutz | DSGVO, BDSG – z.B. unrechtmaessige Datenweitergabe |
| Geldwaesche | GwG – z.B. fehlende Verdachtsmeldungen |
| Produktsicherheit | ProdSG – z.B. mangelhafte Produkte im Verkehr |
| Umweltschutz | BImSchG, KrWG – z.B. illegale Entsorgung |
| Lebensmittelsicherheit | LFGB – z.B. Hygienemaengel |
| Arbeitsschutz | ArbSchG, ArbZG – z.B. ueberlange Arbeitszeiten |
| Verbraucherschutz | UWG – z.B. irrefuehrende Werbung |
| Wettbewerbsrecht | GWB – z.B. Preisabsprachen, Kartelle |
| Steuerrecht | AO – z.B. Steuerhinterziehung bei Unternehmen |
| Vergaberecht | GWB Teil 4 – z.B. Manipulationen bei oeffentlichen Auftraegen |
## Nicht erfasste Bereiche
- **Rein privatrechtliche Streitigkeiten** (z.B. Vertragskonflikte)
- **Nationale Sicherheit** – Informationen, die der nationalen Sicherheit unterliegen
- **Berufsgeheimnisse** – Anwalts-, Arzt- oder Seelsorgegeheimnis (mit Ausnahmen)
## Praxis-Tipp
Im Zweifelsfall sollte eine Meldung **immer entgegengenommen** und geprueft werden. Die Meldestelle entscheidet erst bei der Sachverhaltspruefung, ob ein meldepflichtiger Verstoss vorliegt.',
ARRAY['§ 2 HinSchG', '§ 3 HinSchG', '§ 5 HinSchG'],
ARRAY['anwendungsbereich', 'verstoesse', 'strafrecht', 'bussgeld', 'eu-recht', 'meldepflicht'],
'important',
ARRAY[]::text[])
ON CONFLICT (id) DO NOTHING;
-- Neuer Artikel: Schutz des Hinweisgebers
INSERT INTO compliance_wiki_articles (id, category_id, title, summary, content, legal_refs, tags, relevance, source_urls) VALUES
('hinschg-hinweisgeberschutz', 'hinschg',
'Schutz des Hinweisgebers — Repressalienverbot und Beweislastumkehr',
'Das HinSchG verbietet jede Form der Benachteiligung von Hinweisgebern. Bei Verstoessen greift eine Beweislastumkehr zugunsten des Hinweisgebers.',
'## Ueberblick
Der Schutz hinweisgebender Personen ist das **Kernziel des HinSchG**. Das Gesetz sieht ein umfassendes Verbot von Repressalien, eine Beweislastumkehr und einen Schadensersatzanspruch vor.
## Repressalienverbot (§ 36 HinSchG)
Verboten ist jede Form der **Benachteiligung** aufgrund einer Meldung:
- **Kuendigung** oder Nichterneuerung eines befristeten Vertrags
- **Abmahnung** oder negative Leistungsbewertung
- **Versetzung**, Degradierung oder Befoerderungsverweigerung
- **Gehaltsreduktion** oder Entzug von Verguenstigungen
- **Mobbing**, Ausgrenzung, Einschuechterung
- **Aufnahme in schwarze Listen** oder Branchenregister
- **Entzug einer Lizenz** oder Genehmigung
- **Anordnung einer psychiatrischen Untersuchung**
## Beweislastumkehr (§ 36 Abs. 2 HinSchG)
Erleidet ein Hinweisgeber nach einer Meldung eine Benachteiligung, wird **vermutet**, dass diese Benachteiligung eine Repressalie ist. Der **Arbeitgeber** muss beweisen, dass die Massnahme:
- Auf hinreichend gerechtfertigten Gruenden beruht
- **Keinen Zusammenhang** mit der Meldung hat
## Schadensersatz (§ 37 HinSchG)
- Hinweisgeber hat Anspruch auf **Ersatz des erlittenen Schadens**
- Umfasst **materielle** Schaeden (Gehaltsverlust) und **immaterielle** Schaeden (Mobbing)
- Kein **Mitverschulden** des Hinweisgebers, wenn die Meldung in gutem Glauben erfolgte
## Geschuetzte Personengruppen (§ 1 HinSchG)
- Arbeitnehmerinnen und Arbeitnehmer
- Beamtinnen und Beamte
- Auszubildende und Praktikanten
- Selbststaendige und Anteilseigner
- Mitglieder von Leitungs- und Aufsichtsorganen
- Ehrenamtlich Taetige und Freiwillige
- Bewerberinnen und Bewerber (bei Informationen im Bewerbungsprozess)
## Voraussetzungen fuer den Schutz (§ 33 HinSchG)
Der Schutz greift, wenn der Hinweisgeber:
- **Hinreichenden Grund** hatte anzunehmen, dass die gemeldeten Informationen der Wahrheit entsprechen
- Die Meldung ueber einen **vorgesehenen Kanal** (intern oder extern) erfolgte
- Der Verstoß in den **sachlichen Anwendungsbereich** faellt
**Achtung:** Wissentlich **falsche Meldungen** sind nicht geschuetzt und koennen eigene Schadensersatzpflichten ausloesen (§ 38 HinSchG).',
ARRAY['§ 1 HinSchG', '§ 33 HinSchG', '§ 36 HinSchG', '§ 37 HinSchG', '§ 38 HinSchG'],
ARRAY['repressalienverbot', 'beweislastumkehr', 'schadensersatz', 'hinweisgeberschutz', 'kuendigungsschutz'],
'critical',
ARRAY[]::text[])
ON CONFLICT (id) DO NOTHING;
-- Neuer Artikel: Interne vs. Externe Meldestelle
INSERT INTO compliance_wiki_articles (id, category_id, title, summary, content, legal_refs, tags, relevance, source_urls) VALUES
('hinschg-meldestellen', 'hinschg',
'Interne vs. Externe Meldestelle — Was ist der Unterschied?',
'Das HinSchG sieht interne und externe Meldestellen vor. Hinweisgeber koennen frei waehlen, an wen sie sich wenden. Die Einrichtung einer internen Meldestelle ist Pflicht.',
'## Ueberblick
Das HinSchG unterscheidet zwischen **internen Meldestellen** (beim Unternehmen) und **externen Meldestellen** (bei Behoerden). Hinweisgeber haben ein **Wahlrecht** – sie koennen sich direkt an die externe Meldestelle wenden, ohne den internen Weg vorher beschritten zu haben.
## Interne Meldestelle (§§ 12–18 HinSchG)
### Einrichtungspflicht
- **Ab 50 Beschaeftigten**: Pflicht zur Einrichtung (seit 17.12.2023)
- Unternehmen mit **50–249 Beschaeftigten** duerfen eine gemeinsame Meldestelle nutzen
- Ab **250 Beschaeftigten**: eigene Meldestelle erforderlich
### Anforderungen
- **Unabhaengigkeit** – keine Interessenkonflikte
- **Fachkunde** – geschultes Personal
- Meldekanal muss **muendliche, schriftliche und persoenliche** Meldungen ermoeglichen
- **Anonyme Meldungen** sollen ermoeglicht werden (keine Pflicht, aber empfohlen)
### Besetzung
Die Meldestelle kann besetzt werden durch:
- Interne **Ombudsperson** (Compliance Officer, Datenschutzbeauftragter – in Personalunion kritisch)
- **Externer Dienstleister** (Kanzlei, spezialisierter Anbieter) – erfordert AVV
- **Gremium** aus mehreren Personen
## Externe Meldestelle (§§ 19–31 HinSchG)
Die wichtigsten externen Meldestellen:
| Meldestelle | Zustaendigkeit |
|-------------|---------------|
| **BfJ (Bundesamt fuer Justiz)** | Auffangmeldestelle fuer alle Verstoesse |
| **BaFin** | Finanzaufsicht, Geldwaesche, Wertpapierrecht |
| **Bundeskartellamt** | Wettbewerbsrecht, Kartelle |
## Wahlrecht des Hinweisgebers
- Hinweisgeber duerfen **frei waehlen** zwischen intern und extern
- Die interne Meldung ist **nicht vorrangig** – anders als bei vielen Unternehmenspolicies
- Ein Unternehmen darf **nicht verbieten**, sich an die externe Stelle zu wenden
## Praxis-Tipp
Gestalten Sie die interne Meldestelle **niedrigschwellig und vertrauenswuerdig**, damit Mitarbeiter sie bevorzugt nutzen. Unternehmen erfahren frueh von Problemen und koennen schneller reagieren.',
ARRAY['§ 12 HinSchG', '§ 13 HinSchG', '§ 14 HinSchG', '§ 16 HinSchG', '§ 17 HinSchG', '§ 19 HinSchG', '§ 27 HinSchG'],
ARRAY['meldestelle', 'intern', 'extern', 'ombudsperson', 'bfj', 'bafin', 'wahlrecht'],
'critical',
ARRAY[]::text[])
ON CONFLICT (id) DO NOTHING;
-- Neuer Artikel: Verfahrensablauf bei einer Meldung
INSERT INTO compliance_wiki_articles (id, category_id, title, summary, content, legal_refs, tags, relevance, source_urls) VALUES
('hinschg-verfahrensablauf', 'hinschg',
'Verfahrensablauf — Von der Meldung bis zur Rueckmeldung',
'Der gesetzlich vorgeschriebene Ablauf einer Meldung umfasst Eingangsbestaetigung, Sachverhaltspruefung, Folgemaßnahmen und Rueckmeldung an den Hinweisgeber.',
'## Ueberblick
Das HinSchG schreibt einen strukturierten Verfahrensablauf fuer jede eingehende Meldung vor (§ 17 HinSchG). Dieser Ablauf ist nicht verhandelbar – die Fristen sind gesetzlich bindend.
## Schritt-fuer-Schritt-Verfahren
### 1. Meldungseingang
- Meldung wird ueber den internen Meldekanal eingereicht
- Das System vergibt automatisch eine **Referenznummer** und einen **Zugangscode**
- Der Zugangscode ermoeglicht dem Hinweisgeber die anonyme Statusabfrage
### 2. Eingangsbestaetigung (Frist: 7 Tage)
- Innerhalb von **7 Tagen** nach Eingang muss die Meldestelle den Eingang bestaetigen (§ 17 Abs. 1 S. 2)
- Bei anonymen Meldungen: Bestaetigung ueber den anonymen Kommunikationskanal
- **Wichtig:** Die Bestaetigung darf keine inhaltliche Bewertung enthalten
### 3. Sachverhaltspruefung
- Die Meldestelle prueft, ob ein **meldepflichtiger Verstoss** vorliegt (§ 2 HinSchG)
- Stichhaltigkeitspruefung der gemeldeten Informationen
- Gegebenenfalls Rueckfragen an den Hinweisgeber (ueber anonymen Kanal)
### 4. Folgemaßnahmen (§ 18 HinSchG)
Moegliche Maßnahmen umfassen:
- **Interne Untersuchung** (ggf. mit externen Gutachtern)
- **Abstellung des Verstosses** durch organisatorische Aenderungen
- Weiterleitung an eine **zustaendige Behoerde**
- **Disziplinarmaßnahmen** gegen Verantwortliche
- **Einstellung** des Verfahrens bei unbegruendeten Meldungen
### 5. Rueckmeldung (Frist: 3 Monate)
- Innerhalb von **3 Monaten** nach Eingangsbestaetigung muss dem Hinweisgeber eine Rueckmeldung ueber ergriffene oder geplante Folgemaßnahmen gegeben werden (§ 17 Abs. 2)
- Die Rueckmeldung soll den Hinweisgeber informieren, **ohne laufende Ermittlungen zu gefaehrden**
### 6. Abschluss und Dokumentation
- Abschließende Dokumentation des gesamten Verfahrens
- Aufbewahrung fuer **3 Jahre** nach Abschluss (§ 11 Abs. 5 HinSchG)
- Danach: Loeschung aller personenbezogenen Daten
## Fristen-Uebersicht
| Schritt | Frist | Ab wann |
|---------|-------|---------|
| Eingangsbestaetigung | 7 Tage | Ab Meldungseingang |
| Rueckmeldung | 3 Monate | Ab Eingangsbestaetigung |
| Aufbewahrung | 3 Jahre | Ab Verfahrensabschluss |
## Praxis-Tipp
Richten Sie ein **automatisches Fristen-Monitoring** ein. Das BreakPilot Hinweisgebersystem berechnet die Fristen automatisch und warnt rechtzeitig vor drohender Ueberschreitung.',
ARRAY['§ 11 Abs. 5 HinSchG', '§ 17 Abs. 1 HinSchG', '§ 17 Abs. 2 HinSchG', '§ 18 HinSchG'],
ARRAY['verfahren', 'ablauf', 'fristen', 'eingangsbestaetigung', 'rueckmeldung', 'folgemaßnahmen', 'dokumentation'],
'important',
ARRAY[]::text[])
ON CONFLICT (id) DO NOTHING;
-- Neuer Artikel: Datenschutz-Anforderungen
INSERT INTO compliance_wiki_articles (id, category_id, title, summary, content, legal_refs, tags, relevance, source_urls) VALUES
('hinschg-datenschutz', 'hinschg',
'Datenschutz im Hinweisgebersystem — DSGVO-Konformitaet sicherstellen',
'Das Hinweisgebersystem verarbeitet besonders sensible personenbezogene Daten. Die DSGVO-Anforderungen an Datenschutz, Loeschfristen und Zugriffskontrollen sind strikt einzuhalten.',
'## Ueberblick
Ein Hinweisgebersystem verarbeitet **hochsensible personenbezogene Daten**: die Identitaet des Hinweisgebers, Beschuldigter, Zeugen und den Inhalt der Meldung. Die DSGVO-Anforderungen muessen mit den HinSchG-Pflichten in Einklang gebracht werden.
## Rechtsgrundlage
Die Verarbeitung stuetzt sich auf:
- **Art. 6 Abs. 1c DSGVO** – Erfuellung einer rechtlichen Verpflichtung (HinSchG)
- **Art. 6 Abs. 1f DSGVO** – Berechtigtes Interesse (fuer nicht-verpflichtete Unternehmen)
- **Art. 9 Abs. 2b DSGVO** – Fuer besondere Datenkategorien im Beschaeftigungskontext
## VVT-Eintrag (Pflicht)
Erstellen Sie einen eigenen VVT-Eintrag fuer das Hinweisgebersystem:
| Feld | Inhalt |
|------|--------|
| Bezeichnung | Betrieb des internen Hinweisgebersystems |
| Rechtsgrundlage | Art. 6 Abs. 1c DSGVO i.V.m. §§ 12 ff. HinSchG |
| Kategorien betroffener Personen | Hinweisgeber, Beschuldigte, Zeugen |
| Datenkategorien | Identitaetsdaten, Kommunikationsdaten, Meldungsinhalt |
| Loeschfrist | 3 Jahre nach Verfahrensabschluss |
| Empfaenger | Interne Meldestelle, ggf. externe Meldestelle |
## Technisch-organisatorische Massnahmen (TOM)
- **Verschluesselung** – Alle Meldungsdaten at-rest und in-transit verschluesselt
- **Zugriffsbeschraenkung** – Nur die benannte Meldestelle darf auf Daten zugreifen
- **Protokollierung** – Revisionssicherer Audit-Trail aller Zugriffe
- **Pseudonymisierung** – Anonyme Meldungen ohne Zuordnung zu Klarnamen
- **Trennung** – Meldungsdaten getrennt von sonstigen HR-Daten speichern
## Loeschkonzept
| Daten | Loeschfrist | Rechtsgrundlage |
|-------|-------------|-----------------|
| Meldungsdaten | 3 Jahre nach Abschluss | § 11 Abs. 5 HinSchG |
| Audit-Trail | 3 Jahre nach Abschluss | § 11 Abs. 5 HinSchG |
| Kommunikationsdaten | 3 Jahre nach Abschluss | § 11 Abs. 5 HinSchG |
| Zugangscodes | Nach Verfahrensabschluss | Zweckerfuellung |
## DSFA-Pflicht?
Eine **Datenschutz-Folgenabschaetzung** (Art. 35 DSGVO) ist in vielen Faellen erforderlich, da:
- **Systematische Ueberwachung** von Beschaeftigten (potenziell)
- Verarbeitung **besonderer Datenkategorien** moeglich (Art. 9 DSGVO)
- **Verletzliche Personengruppen** betroffen (Hinweisgeber, Beschuldigte)
## Praxis-Tipp
Fuehren Sie eine DSFA durch und dokumentieren Sie die Abwaegung. Dies dient auch als Nachweis der Rechenschaftspflicht (Art. 5 Abs. 2 DSGVO).',
ARRAY['Art. 5 Abs. 2 DSGVO', 'Art. 6 Abs. 1c DSGVO', 'Art. 9 Abs. 2b DSGVO', 'Art. 28 DSGVO', 'Art. 35 DSGVO', '§ 8 HinSchG', '§ 11 Abs. 5 HinSchG', '§ 26 BDSG'],
ARRAY['datenschutz', 'dsgvo', 'vvt', 'dsfa', 'loeschfristen', 'tom', 'verschluesselung', 'audit-trail'],
'critical',
ARRAY[]::text[])
ON CONFLICT (id) DO NOTHING;
@@ -0,0 +1,230 @@
-- Migration 055: CRA (Cyber Resilience Act) Wiki-Kategorie und Artikel
-- Neue Kategorie + 3 Artikel zum EU Cyber Resilience Act
-- Category: CRA. Seeds the wiki category row with id 'cra'.
-- ON CONFLICT (id) DO NOTHING leaves an already existing 'cra' row untouched,
-- so re-running this statement neither fails nor overwrites edits.
INSERT INTO compliance_wiki_categories (id, name, description, icon, sort_order) VALUES
('cra', 'Cyber Resilience Act (CRA)', 'EU-Verordnung fuer Cybersicherheit von Produkten mit digitalen Elementen', 'Shield', 75)
ON CONFLICT (id) DO NOTHING;
-- Artikel 1: CRA Grundlagen
INSERT INTO compliance_wiki_articles (id, category_id, title, summary, content, legal_refs, tags, relevance, source_urls) VALUES
('cra-grundlagen', 'cra',
'Cyber Resilience Act — Ueberblick und Pflichten',
'Der CRA (EU) 2024/2847 verpflichtet Hersteller von Produkten mit digitalen Elementen zu umfassenden Cybersicherheits-Massnahmen ueber den gesamten Produktlebenszyklus.',
'## Ueberblick
Der **EU Cyber Resilience Act (CRA)**, Verordnung (EU) 2024/2847, ist am **10. Dezember 2024** in Kraft getreten. Er etabliert horizontale Cybersicherheitsanforderungen fuer alle **Produkte mit digitalen Elementen**, die in der EU in Verkehr gebracht werden.
## Zeitplan
| Datum | Meilenstein |
|-------|------------|
| 10.12.2024 | Inkrafttreten |
| 11.06.2026 | Konformitaetsbewertungsstellen muessen benannt sein |
| 11.09.2026 | Meldepflicht fuer Schwachstellen und Vorfaelle |
| 11.12.2027 | Volle Anwendung – CE-Kennzeichnung erforderlich |
## Was sind "Produkte mit digitalen Elementen"?
Jedes Software- oder Hardware-Produkt, das:
- Eine **Datenverbindung** (direkt oder indirekt) zu einem Geraet oder Netzwerk hat
- **Software** enthaelt, die bestimmungsgemaeß genutzt wird
**Beispiele:** IoT-Geraete, Firmware, eigenstaendige Software, Betriebssysteme, Router, Smart-Home-Geraete, industrielle Steuerungssysteme.
## Kernpflichten fuer Hersteller
### 1. Cybersecurity-Risikobewertung
- Systematische Bewertung der Cybersecurity-Risiken des Produkts
- Dokumentation der Risikoanalyse
- Regelmaessige Aktualisierung
### 2. Secure Development (SSDLC)
- Sichere Entwicklungsprozesse etablieren
- Code Reviews und Security Testing
- Supply-Chain-Security pruefen
### 3. Vulnerability Management
- Aktives CVE-Monitoring
- Coordinated Vulnerability Disclosure (CVD)
- Patch-Bereitstellung waehrend des gesamten Support-Zeitraums
### 4. Security Updates
- Sichere Update-Mechanismen (signiert, integritaetsgeprueft)
- Automatische oder einfache Update-Moeglichkeit fuer Nutzer
- Mindest-Support-Zeitraum: 5 Jahre oder erwartete Produktlebensdauer
### 5. Software Bill of Materials (SBOM)
- Dokumentation aller Software-Komponenten
- Top-Level-Abhaengigkeiten
- Maschinenlesbares Format
### 6. Incident Reporting
- **24 Stunden:** Fruehwarnung an ENISA/nationale Behoerde
- **72 Stunden:** Detaillierter Incident Report
- Meldepflicht fuer aktiv ausgenutzte Schwachstellen
## CE-Kennzeichnung
Der CRA wird Teil der **CE-Konformitaet**. Ab Dezember 2027 duerfen Produkte ohne Cybersecurity-Konformitaet **nicht mehr in der EU verkauft werden**.
## Sanktionen
| Verstoss | Bussgeld |
|----------|----------|
| Wesentliche Anforderungen (Annex I) | Bis 15 Mio. EUR oder 2,5% des Jahresumsatzes |
| Sonstige Pflichten | Bis 10 Mio. EUR oder 2% des Jahresumsatzes |
| Falsche Informationen | Bis 5 Mio. EUR oder 1% des Jahresumsatzes |',
ARRAY['Art. 13 CRA', 'Art. 14 CRA', 'Annex I CRA', 'Annex II CRA', '(EU) 2024/2847'],
ARRAY['cra', 'cybersecurity', 'ce-kennzeichnung', 'iot', 'software', 'sbom', 'vulnerability', 'incident-reporting'],
'critical',
ARRAY['https://eur-lex.europa.eu/eli/reg/2024/2847/oj/eng'])
ON CONFLICT (id) DO NOTHING;
-- Artikel 2: CRA Security Controls (Annex I)
INSERT INTO compliance_wiki_articles (id, category_id, title, summary, content, legal_refs, tags, relevance, source_urls) VALUES
('cra-security-controls', 'cra',
'CRA Annex I — 35 Essential Cybersecurity Requirements',
'Der CRA definiert in Annex I die wesentlichen Cybersicherheitsanforderungen. Daraus ergeben sich etwa 35 konkrete Security-Controls fuer den gesamten Produktlebenszyklus.',
'## Ueberblick
Annex I des CRA enthaelt die **Essential Cybersecurity Requirements**. Sie lassen sich in 7 Themenbereiche mit insgesamt etwa 35 konkreten Controls aufteilen.
## 1. Secure-by-Design / Architektur
| # | Control | Beschreibung |
|---|---------|-------------|
| 1 | Secure-by-default | Produkte mit sicheren Standardeinstellungen ausliefern |
| 2 | Minimale Angriffsflaeche | Nur notwendige Dienste und Schnittstellen aktivieren |
| 3 | Sichere Systemarchitektur | Sicherheitskritische Komponenten isolieren und schuetzen |
| 4 | Least-Privilege-Prinzip | Minimale Berechtigungen fuer Komponenten und Nutzer |
| 5 | Trennung kritischer Funktionen | Isolation sicherheitskritischer Funktionen |
| 6 | System-Haertung | Deaktivierung unnoetiger Services und Ports |
| 7 | Manipulationsschutz | Schutz vor unautorisierter Software-Aenderung |
| 8 | Integritaetspruefung | Signaturen und Integritaetschecks |
| 9 | Zugriffsschutz | Zugriffskontrollen implementieren |
## 2. Authentifizierung & Zugriffskontrolle
| # | Control | Beschreibung |
|---|---------|-------------|
| 10 | Starke Authentifizierung | Sichere Authentifizierungsmechanismen |
| 11 | Keine Default-Passwoerter | Keine universellen Standardpasswoerter |
| 12 | Credential-Management | Sichere Verwaltung von Zugangsdaten |
| 13 | Sitzungsmanagement | Sichere Session-Verwaltung |
| 14 | Brute-Force-Schutz | Schutz vor Brute-Force-Angriffen |
| 15 | Autorisierung | Rollenbasierte Zugriffskontrolle |
## 3. Kryptografie & Datenschutz
| # | Control | Beschreibung |
|---|---------|-------------|
| 16 | Datenverschluesselung | Verschluesselung sensibler Daten |
| 17 | Speicher-Schutz | Schutz gespeicherter Daten (at-rest) |
| 18 | Transport-Schutz | Schutz uebertragener Daten (in-transit) |
| 19 | Schluesselmanagement | Sicheres kryptografisches Schluesselmanagement |
| 20 | Schluesselschutz | Schutz kryptografischer Schluessel vor Zugriff |
## 4. Software-Lifecycle-Security
| # | Control | Beschreibung |
|---|---------|-------------|
| 21 | Secure Development Lifecycle | Strukturierter SSDLC-Prozess |
| 22 | Code Reviews | Systematische Code-Ueberpruefungen |
| 23 | Sichere Entwicklungspraktiken | Static Analysis, SAST, DAST |
| 24 | Supply-Chain-Security | Pruefung von Drittanbieter-Komponenten |
| 25 | Dependency-Monitoring | Ueberwachung von Abhaengigkeiten |
| 26 | SBOM | Software Bill of Materials fuehren |
## 5. Logging, Monitoring & Incident Detection
| # | Control | Beschreibung |
|---|---------|-------------|
| 27 | Security-Logging | Protokollierung sicherheitsrelevanter Ereignisse |
| 28 | Ereignis-Monitoring | Ueberwachung sicherheitsrelevanter Events |
| 29 | Anomalie-Erkennung | Erkennung von Angriffen oder Anomalien |
| 30 | Log-Integritaet | Schutz der Protokoll-Integritaet |
## 6. Update- und Patch-Management
| # | Control | Beschreibung |
|---|---------|-------------|
| 31 | Sichere Update-Mechanismen | Sichere Verfahren fuer Software-Updates |
| 32 | Update-Authentizitaet | Signaturen fuer Updates |
| 33 | Update-Integritaet | Integritaetspruefung bei Updates |
| 34 | Lifecycle-Support | Security-Updates waehrend des gesamten Lebenszyklus |
## 7. Vulnerability-Handling
| # | Control | Beschreibung |
|---|---------|-------------|
| 35 | Vulnerability-Management | Strukturierter Prozess fuer Schwachstellen-Behandlung |
Dazu gehoert:
- Koordinierte Offenlegung (Coordinated Vulnerability Disclosure)
- CVE-Monitoring
- Patch-Bereitstellung innerhalb angemessener Frist
## Automatisierungspotential
Diese 35 Controls koennen automatisch zu folgenden Dokumenten fuehren:
- **Cybersecurity Policy** (Grundsatzdokument)
- **Secure Development Policy** (SSDLC)
- **Vulnerability Management Policy** (CVD, Patching)
- **Incident Response Policy** (24h/72h Meldung)
- **SBOM-Dokumentation** (Komponentenliste)',
ARRAY['Annex I CRA', 'Art. 13 CRA', 'Art. 14 CRA', 'Art. 15 CRA'],
ARRAY['security-controls', 'annex-i', 'secure-by-design', 'authentifizierung', 'kryptografie', 'sbom', 'vulnerability', 'patching'],
'critical',
ARRAY['https://eur-lex.europa.eu/eli/reg/2024/2847/oj/eng'])
ON CONFLICT (id) DO NOTHING;
-- Artikel 3: CRA + NIS2 + AI Act Zusammenspiel
INSERT INTO compliance_wiki_articles (id, category_id, title, summary, content, legal_refs, tags, relevance, source_urls) VALUES
('cra-regulierungsrahmen', 'cra',
'CRA + NIS2 + AI Act — Das neue EU-Security-Framework',
'CRA, NIS2-Richtlinie und AI Act bilden zusammen ein umfassendes EU-Sicherheitsframework fuer digitale Produkte, Infrastrukturen und KI-Systeme.',
'## Ueberblick
Die EU hat mit drei zentralen Rechtsakten ein zusammenhaengendes Framework fuer Cybersicherheit und KI-Regulierung geschaffen. Fuer Softwarehersteller, die KI einsetzen, sind alle drei relevant.
## Die drei Saeulen
| Verordnung | Fokus | Zielgruppe | Anwendung ab |
|-----------|-------|-----------|-------------|
| **CRA** (2024/2847) | Produkt-Cybersecurity | Hersteller von Hardware/Software | 12/2027 |
| **NIS2** (2022/2555) | Infrastruktur-Security | Betreiber wesentlicher Dienste | 10/2024 (national) |
| **AI Act** (2024/1689) | KI-Regulierung | Anbieter/Betreiber von KI-Systemen | 08/2025 (stufenweise) |
## Abgrenzung
### CRA vs. NIS2
- **CRA**: Regelt die **Sicherheit des Produkts** selbst (Design, Updates, Vulnerability Handling)
- **NIS2**: Regelt die **Sicherheit der Organisation** (Risikomanagement, Incident Response, Supply Chain)
- **Ueberschneidung**: Beide fordern Incident Reporting und Supply-Chain-Security
### CRA vs. AI Act
- **CRA**: Cybersecurity-Anforderungen an **alle** digitalen Produkte
- **AI Act**: Zusaetzliche Anforderungen fuer Produkte, die **KI enthalten** (Transparenz, Erklaerbarkeit, Risikobewertung)
- **Ueberschneidung**: Hochrisiko-KI-Systeme muessen sowohl CRA als auch AI Act erfuellen
## Synergien nutzen
Ein Unternehmen, das alle drei Verordnungen erfuellen muss, kann Synergien nutzen:
| Thema | CRA | NIS2 | AI Act |
|-------|-----|------|--------|
| Risikobewertung | Produkt-Risiko | Org-Risiko | KI-Risiko |
| Incident Reporting | 24h/72h | 24h/72h | Meldepflicht |
| Supply Chain | SBOM | Lieferantenpruefung | Drittanbieter-KI |
| Dokumentation | Tech. Doku | Policies | KI-Registrierung |
| Audit/Konformitaet | CE-Kennzeichnung | Zertifizierung | Konformitaetsbewertung |
## Empfehlung
Bauen Sie ein **integriertes Compliance-Management-System** auf, das alle drei Verordnungen abdeckt. Gemeinsame Policies (Security, Incident Response, Risk Management) koennen fuer alle drei Regelwerke genutzt werden.',
ARRAY['(EU) 2024/2847', '(EU) 2022/2555', '(EU) 2024/1689', 'Art. 13 CRA', 'Art. 21 NIS2', 'Art. 9 AI Act'],
ARRAY['cra', 'nis2', 'ai-act', 'security-framework', 'compliance', 'synergien', 'ce-kennzeichnung'],
'important',
ARRAY[]::text[])
ON CONFLICT (id) DO NOTHING;
@@ -0,0 +1,515 @@
-- Migration 056: CRA Cybersecurity Policy Template
-- Unternehmensrichtlinie Cybersecurity basierend auf EU Cyber Resilience Act, ISO 27001 Best Practices
INSERT INTO compliance_legal_templates (
id, tenant_id, document_type, title, description, content,
placeholders, language, jurisdiction,
license_id, license_name, source_name,
attribution_required, is_complete_document, version, status,
created_at, updated_at
) VALUES (
gen_random_uuid(),
'9282a473-5c95-4b3a-bf78-0ecc0ec71d3e',
'cybersecurity_policy',
'Unternehmensrichtlinie Cybersecurity (CRA-konform)',
'Umfassende Cybersecurity-Richtlinie basierend auf dem EU Cyber Resilience Act (EU) 2024/2847, ISO 27001 und Secure-Development-Standards. Deckt Governance, Risikomanagement, Secure Development, Vulnerability Management, Incident Response und Compliance ab.',
$template$# Unternehmensrichtlinie Cybersecurity
**{{COMPANY_NAME}}**
*(Cybersecurity Policy – CRA-konform)*
| Feld | Inhalt |
|------|--------|
| Dokumenttyp | Unternehmensrichtlinie |
| Version | {{DOCUMENT_VERSION}} |
| Datum | {{VERSION_DATE}} |
| Naechste Ueberpruefung | {{NEXT_REVIEW_DATE}} |
| Verantwortlich | {{ISB_NAME}} (CISO/ISB) |
| Freigabe | {{GF_NAME}} (Geschaeftsfuehrung) |
| Vertraulichkeit | Intern |
---
## 1. Zweck der Richtlinie
Diese Cybersecurity-Richtlinie legt die organisatorischen und technischen Massnahmen fest, mit denen {{COMPANY_NAME}}:
- Informationssysteme schuetzt
- Cyberrisiken systematisch reduziert
- Gesetzliche Anforderungen erfuellt (insb. EU Cyber Resilience Act, NIS2, DSGVO)
- Sicherheitsvorfaelle erkennt, behandelt und meldet
Die Richtlinie gilt fuer alle:
- Mitarbeiterinnen und Mitarbeiter von {{COMPANY_NAME}}
- Externe Dienstleister und Auftragnehmer
- IT-Systeme, Software und Cloud-Services
- Produkte mit digitalen Elementen im Sinne des CRA
---
## 2. Geltungsbereich
Diese Richtlinie gilt fuer:
- Unternehmens-IT und Netzwerkinfrastruktur
- Interne Softwareentwicklung
- Cloud-Infrastruktur und SaaS-Dienste
- Datenverarbeitungssysteme
- Produkte mit digitalen Elementen (Software, IoT, Firmware)
- Lieferanten und Dienstleister mit Zugang zu Systemen von {{COMPANY_NAME}}
Betroffene Assets:
- Server und Netzwerkkomponenten
- Endgeraete (Laptops, Mobilgeraete)
- Software und Firmware
- Datenbanken und APIs
- Kryptografische Schluessel und Zertifikate
---
## 3. Sicherheitsziele
Die Cybersecurity-Strategie von {{COMPANY_NAME}} verfolgt folgende Ziele:
### Vertraulichkeit
Schutz sensibler Daten vor unbefugtem Zugriff. Klassifizierung von Daten nach Schutzbedarf.
### Integritaet
Sicherstellung, dass Daten und Systeme nicht unautorisiert veraendert werden. Einsatz von Integritaetspruefungen und Signaturen.
### Verfuegbarkeit
Systeme und Dienste muessen gemaess den vereinbarten SLAs verfuegbar sein. Redundanz und Wiederherstellungsfaehigkeit sicherstellen.
### Nachvollziehbarkeit
Sicherheitsrelevante Ereignisse muessen lueckenlos dokumentiert und fuer Audits nachvollziehbar sein.
---
## 4. Governance und Verantwortlichkeiten
### 4.1 Geschaeftsfuehrung
{{GF_NAME}} ist verantwortlich fuer:
- Festlegung der Sicherheitsstrategie
- Bereitstellung angemessener Ressourcen
- Ueberwachung der Compliance-Einhaltung
- Jaehrliche Freigabe dieser Richtlinie
### 4.2 Chief Information Security Officer (CISO/ISB)
{{ISB_NAME}} ist verantwortlich fuer:
- Umsetzung der Sicherheitsstrategie
- Risikomanagement und Risikoberichterstattung
- Security-Monitoring und Threat Intelligence
- Koordination des Incident-Response-Teams
- Kontaktperson fuer Behoerden bei Sicherheitsvorfaellen
### 4.3 Datenschutzbeauftragter
{{DPO_NAME}} ({{DPO_EMAIL}}) wird bei sicherheitsrelevanten Vorfaellen einbezogen, die personenbezogene Daten betreffen.
### 4.4 IT-Abteilung
Verantwortlich fuer:
- Sichere Infrastruktur und Systemhaertung
- Patch-Management und Update-Bereitstellung
- Netzwerksegmentierung und Firewall-Management
- Monitoring und Log-Management
### 4.5 Entwicklerteams
Verantwortlich fuer:
- Secure Coding und Code Reviews
- Dependency Management und SBOM-Pflege
- Security Testing (SAST, DAST, SCA)
- Vulnerability Remediation
### 4.6 Alle Mitarbeiter
Alle Mitarbeiter von {{COMPANY_NAME}} muessen:
- Sicherheitsrichtlinien einhalten
- Sicherheitsvorfaelle unverzueglich melden
- An jaehrlichen Security-Schulungen teilnehmen
- Phishing-Versuche und verdaechtige Aktivitaeten melden
---
## 5. Risikomanagement
{{COMPANY_NAME}} fuehrt regelmaessig eine Cyber-Risikoanalyse durch.
### Prozess
1. **Identifikation** kritischer Assets und Daten
2. **Bedrohungsanalyse** (Threat Modeling, STRIDE)
3. **Schwachstellenanalyse** (CVE-Monitoring, Vulnerability Scanning)
4. **Risikobewertung** (Eintrittswahrscheinlichkeit x Auswirkung)
5. **Risikobehandlung** (Vermeiden, Reduzieren, Uebertragen, Akzeptieren)
### Frequenz
Risikobewertungen erfolgen:
- Mindestens jaehrlich
- Bei wesentlichen Systemaenderungen
- Bei neuen Produkten oder Dienstleistungen
- Nach Sicherheitsvorfaellen
### Dokumentation
Alle Risikoanalysen werden dokumentiert und fuer mindestens 3 Jahre aufbewahrt. Die Ergebnisse werden der Geschaeftsfuehrung in Form eines Risikoberichts vorgelegt.
---
## 6. Secure System Architecture
Systeme von {{COMPANY_NAME}} muessen nach folgenden Prinzipien entwickelt und betrieben werden:
### Security by Design
Sicherheitsanforderungen werden bereits in der Architekturphase beruecksichtigt. Jedes neue System durchlaeuft ein Security Architecture Review.
### Security by Default
Systeme werden mit sicheren Grundeinstellungen ausgeliefert. Keine Dienste oder Ports sind standardmaessig aktiviert, die nicht benoetigt werden.
### Least Privilege
Benutzer und Systeme erhalten nur die minimal notwendigen Berechtigungen. Privilegierte Zugriffe werden gesondert protokolliert.
### Segmentierung
Kritische Systeme werden durch Netzwerksegmentierung isoliert. Produktiv-, Entwicklungs- und Testumgebungen sind strikt getrennt.
### Haertung
Alle Systeme werden gemaess anerkannter Haertungsrichtlinien (CIS Benchmarks, BSI IT-Grundschutz) konfiguriert.
---
## 7. Zugriffskontrollen
### Anforderungen
- Eindeutige, personalisierte Benutzerkonten
- Starke Passwortrichtlinie (mind. 12 Zeichen, Komplexitaet)
- Multi-Faktor-Authentifizierung (MFA) fuer alle administrativen Zugriffe und externe Zugaenge
- Rollenbasierte Zugriffskontrolle (RBAC) mit regelmaessiger Rezertifizierung
- Automatische Sperrung nach 5 fehlgeschlagenen Login-Versuchen
### Verboten
- Gemeinsam genutzte Accounts (Shared Accounts)
- Universal-Default-Passwoerter
- Unverschluesselte Speicherung von Zugangsdaten
- Weitergabe von Zugangsdaten per E-Mail
### Privileged Access Management
Administratorzugriffe muessen:
- Gesondert beantragt und genehmigt werden
- Zeitlich begrenzt sein (Just-in-Time Access)
- Vollstaendig protokolliert werden
---
## 8. Kryptografie
{{COMPANY_NAME}} verwendet ausschliesslich moderne, anerkannte kryptografische Verfahren.
### Verschluesselung erforderlich fuer
- Gespeicherte sensible Daten (at rest) – AES-256
- Datenuebertragung (in transit) – TLS 1.2+, vorzugsweise TLS 1.3
- Backups – vollstaendig verschluesselt
- Konfigurationsdaten und Secrets – Vault oder vergleichbar
### Schluesselmanagement
- Schluessel muessen sicher gespeichert werden (HSM oder Vault)
- Regelmaessige Rotation (mind. jaehrlich, bei Kompromittierung sofort)
- Zugriff nur fuer autorisierte Personen
- Dokumentation der Schluessel-Lebenszyklen
### Verbotene Verfahren
- MD5 und SHA-1 fuer kryptografische Zwecke
- DES und 3DES
- SSL und TLS < 1.2
---
## 9. Secure Software Development Lifecycle (SSDLC)
Alle Softwareprodukte von {{COMPANY_NAME}} muessen einen sicheren Entwicklungsprozess durchlaufen. Dies entspricht den Anforderungen des CRA Annex I.
### Entwicklungsprozess
1. **Security Requirements** – Sicherheitsanforderungen in User Stories und Epics
2. **Threat Modeling** – Bedrohungsanalyse in der Designphase
3. **Secure Coding** – Einhaltung von Secure-Coding-Standards
4. **Code Review** – Peer Review mit Security-Fokus
5. **Security Testing** – Automatisierte und manuelle Tests
6. **Release-Freigabe** – Security Sign-off vor Deployment
### Pflichtmassnahmen
- **Static Application Security Testing (SAST)** in der CI/CD-Pipeline
- **Software Composition Analysis (SCA)** – Dependency Scanning
- **Dynamic Application Security Testing (DAST)** vor jedem Major Release
- **Secrets Detection** – Automatische Pruefung auf eingebettete Zugangsdaten
- **Penetration Testing** – mindestens jaehrlich durch externe Tester
---
## 10. Software-Supply-Chain-Security
{{COMPANY_NAME}} kontrolliert externe Softwarekomponenten systematisch.
### Software Bill of Materials (SBOM)
Fuer alle Produkte wird ein SBOM gefuehrt, das mindestens folgende Informationen enthaelt:
- Name und Version aller Software-Komponenten
- Lizenzinformationen
- Bekannte Schwachstellen (CVE)
Das SBOM wird bei jedem Release aktualisiert und in maschinenlesbarem Format (CycloneDX oder SPDX) bereitgestellt.
### Open-Source-Kontrolle
- Lizenzpruefung vor Aufnahme neuer Abhaengigkeiten
- Monitoring auf bekannte Schwachstellen (CVE)
- Regelmaessige Updates von Abhaengigkeiten
---
## 11. Logging und Monitoring
### Logging umfasst
- Erfolgreiche und fehlgeschlagene Login-Versuche
- Administrative Systemaenderungen
- Zugriffe auf sensible Daten
- Sicherheitsrelevante Konfigurationsaenderungen
- API-Zugriffe und Fehler
### Anforderungen an Logs
- Manipulationssicher (append-only, signiert oder WORM)
- Zentral gesammelt (SIEM oder vergleichbar)
- Aufbewahrung mindestens 12 Monate
- Zugriff nur fuer autorisiertes Security-Personal
### Monitoring
- Echtzeit-Ueberwachung sicherheitsrelevanter Ereignisse
- Automatische Alarmierung bei Anomalien
- Korrelation von Events aus verschiedenen Quellen
---
## 12. Vulnerability Management
{{COMPANY_NAME}} betreibt ein strukturiertes Schwachstellenmanagement.
### Prozess
1. **Identifikation** – Automatische Scans, Bug Bounty, CVE-Monitoring
2. **Bewertung** – Risikobewertung nach CVSS
3. **Priorisierung** – Kritische Schwachstellen zuerst
4. **Behebung** – Patch-Entwicklung und Deployment
5. **Verifizierung** – Bestaetigung der Behebung
6. **Kommunikation** – Information betroffener Kunden und Behoerden
### Coordinated Vulnerability Disclosure (CVD)
{{COMPANY_NAME}} veroeffentlicht eine CVD-Policy. Sicherheitsforscher koennen Schwachstellen an {{SECURITY_EMAIL}} melden. Meldungen werden innerhalb von 5 Werktagen bestaetigt.
---
## 13. Patch- und Update-Management
Alle Systeme muessen regelmaessig aktualisiert werden.
### Patchzyklen
| Risikostufe | Reaktionszeit |
|-------------|---------------|
| Kritisch (CVSS >= 9.0) | 24-72 Stunden |
| Hoch (CVSS 7.0-8.9) | 7 Tage |
| Mittel (CVSS 4.0-6.9) | 30 Tage |
| Niedrig (CVSS < 4.0) | Naechster regulaerer Update-Zyklus |
### Anforderungen an Updates
- Alle Updates muessen **digital signiert** sein
- Integritaetspruefung vor Installation
- Rollback-Moeglichkeit bei fehlerhaften Updates
- Automatische Update-Benachrichtigung fuer Kunden
- **Mindest-Support-Zeitraum: 5 Jahre** (gemaess CRA)
---
## 14. Incident Response
{{COMPANY_NAME}} betreibt einen dokumentierten Incident-Response-Prozess.
### Schritte
1. **Detection** – Erkennung durch Monitoring, Meldung oder externe Information
2. **Classification** – Einstufung nach Schweregrad (P1-P4)
3. **Containment** – Sofortige Eindaemmung des Vorfalls
4. **Investigation** – Forensische Analyse und Ursachenermittlung
5. **Recovery** – Wiederherstellung des Normalbetriebs
6. **Reporting** – Dokumentation und Meldung an Behoerden
7. **Lessons Learned** – Nachbereitung und Verbesserung
### Meldepflichten (CRA-konform)
| Meldung | Frist | Empfaenger |
|---------|-------|-----------|
| **Fruehwarnung** | 24 Stunden | ENISA / nationale Behoerde |
| **Detaillierter Bericht** | 72 Stunden | ENISA / nationale Behoerde |
| **Abschlussbericht** | 1 Monat | ENISA / nationale Behoerde |
Bei personenbezogenen Daten gelten zusaetzlich die Fristen nach Art. 33/34 DSGVO (72 Stunden an Aufsichtsbehoerde).
### Kontakte
| Rolle | Person | Kontakt |
|-------|--------|---------|
| CISO/ISB | {{ISB_NAME}} | {{ISB_EMAIL}} |
| DSB | {{DPO_NAME}} | {{DPO_EMAIL}} |
| GF | {{GF_NAME}} | {{GF_EMAIL}} |
---
## 15. Security Testing
Folgende Tests werden regelmaessig durchgefuehrt:
| Test | Frequenz | Durchfuehrung |
|------|----------|--------------|
| Vulnerability Scans | Woechentlich | Automatisiert (CI/CD) |
| SAST/SCA | Bei jedem Commit | Automatisiert (CI/CD) |
| DAST | Vor Major Releases | Automatisiert + manuell |
| Penetration Tests | Jaehrlich | Externer Dienstleister |
| Red-Team-Tests | Alle 2 Jahre | Externer Dienstleister |
| Social Engineering | Jaehrlich | Externer Dienstleister |
---
## 16. Backup und Wiederherstellung
### Anforderungen
- **Taegliche Backups** aller kritischen Systeme und Daten
- **Off-Site-Backups** an geografisch getrenntem Standort
- **Verschluesselung** aller Backup-Daten
- **Wiederherstellungstests** mindestens vierteljaehrlich
### Recovery-Ziele
| Metrik | Ziel |
|--------|------|
| Recovery Time Objective (RTO) | {{RTO_HOURS}} Stunden |
| Recovery Point Objective (RPO) | {{RPO_HOURS}} Stunden |
---
## 17. Lieferanten- und Drittanbieter-Management
Lieferanten mit Zugang zu Systemen oder Daten von {{COMPANY_NAME}} muessen Sicherheitsanforderungen erfuellen.
### Anforderungen
- Sicherheitspruefung vor Vertragsabschluss (Security Assessment)
- Sicherheitsanforderungen im Vertrag (Auftragsverarbeitung, SLA)
- Regelmaessige Audits und Compliance-Nachweise
- Incident-Notification-Pflicht innerhalb von 24 Stunden
- Nachweis ueber eigenes Vulnerability Management
---
## 18. Schulungen und Awareness
Alle Mitarbeiter von {{COMPANY_NAME}} erhalten:
- **Jaehrliche Security-Awareness-Trainings**
- **Phishing-Simulationen** (mind. 2x jaehrlich)
- **Rollenspezifische Schulungen** (Entwickler: Secure Coding, IT: Incident Response)
- **Onboarding-Schulung** fuer neue Mitarbeiter
Teilnahme ist verpflichtend. Die Teilnahme wird dokumentiert.
---
## 19. Dokumentation und Compliance
{{COMPANY_NAME}} dokumentiert:
- Risikoanalysen und Risikobehandlungsplaene
- Sicherheitskontrollen und deren Wirksamkeit
- Sicherheitsvorfaelle und deren Behandlung
- Software-Updates und Patches
- SBOM fuer alle Produkte
- Audit-Ergebnisse
Die Dokumentation muss jederzeit fuer Audits und behoerdliche Anfragen verfuegbar sein.
### Regulatorische Compliance
Diese Richtlinie dient der Einhaltung folgender Vorschriften:
- **EU Cyber Resilience Act** (EU) 2024/2847
- **NIS2-Richtlinie** (EU) 2022/2555
- **DSGVO** (EU) 2016/679 – technische und organisatorische Massnahmen
- **ISO/IEC 27001** – Best Practices fuer Informationssicherheit
---
## 20. Durchsetzung
Verstoesse gegen diese Richtlinie koennen je nach Schwere folgende Konsequenzen haben:
- Disziplinarmassnahmen
- Vertragsstrafen (bei externen Dienstleistern)
- Rechtliche Konsequenzen (bei vorsaetzlichen Verstoessen)
---
## 21. Ueberpruefung und Aktualisierung
Diese Cybersecurity-Richtlinie wird ueberprueft:
- **Jaehrlich** durch {{ISB_NAME}} (CISO/ISB)
- Bei **regulatorischen Aenderungen** (neue EU-Verordnungen, nationale Gesetze)
- Nach **groesseren Sicherheitsvorfaellen**
- Bei **wesentlichen Aenderungen** der IT-Infrastruktur oder Produktlandschaft
Die naechste planmaessige Ueberpruefung ist am **{{NEXT_REVIEW_DATE}}**.
---
## Freigabe
| | Name | Datum | Unterschrift |
|--|------|-------|-------------|
| Erstellt von | {{ISB_NAME}} (CISO/ISB) | {{VERSION_DATE}} | _________________ |
| Freigegeben von | {{GF_NAME}} (Geschaeftsfuehrung) | {{VERSION_DATE}} | _________________ |
---
*Dieses Dokument ist Eigentum von {{COMPANY_NAME}} und unterliegt der Vertraulichkeitsstufe "Intern".*
$template$,
CAST('["COMPANY_NAME","COMPANY_ADDRESS","COMPANY_CITY","GF_NAME","GF_EMAIL","ISB_NAME","ISB_EMAIL","DPO_NAME","DPO_EMAIL","SECURITY_EMAIL","DOCUMENT_VERSION","VERSION_DATE","NEXT_REVIEW_DATE","RTO_HOURS","RPO_HOURS"]' AS jsonb),
'de', 'DE',
'mit', 'MIT License', 'BreakPilot Compliance',
false, true, '1.0.0', 'published',
NOW(), NOW()
) ON CONFLICT DO NOTHING;
@@ -0,0 +1,23 @@
-- Migration 057: allow the batch processing paths on canonical_processed_chunks.
-- Extends the processing_path CHECK constraint with 'structured_batch' and
-- 'llm_reform_batch' (used by batch control generation).
DO $$
BEGIN
    -- to_regclass() resolves the name through the current search_path and
    -- returns NULL when nothing matches, so this guard looks at the same
    -- relation the unqualified ALTER TABLE statements below will touch.
    -- A lookup in information_schema.tables by table_name alone would also be
    -- satisfied by a same-named table living in an unrelated schema.
    IF to_regclass('canonical_processed_chunks') IS NOT NULL THEN
        -- Drop first so the constraint can be recreated with the wider value
        -- list; IF EXISTS keeps the migration re-runnable.
        ALTER TABLE canonical_processed_chunks
            DROP CONSTRAINT IF EXISTS canonical_processed_chunks_processing_path_check;

        ALTER TABLE canonical_processed_chunks
            ADD CONSTRAINT canonical_processed_chunks_processing_path_check
            CHECK (processing_path IN (
                'structured',
                'llm_reform',
                'skipped',
                'prefilter_skip',
                'no_control',
                'store_failed',
                'error',
                'structured_batch',
                'llm_reform_batch'
            ));
    END IF;
END $$;
@@ -0,0 +1,8 @@
-- Migration 058: add canonical_controls.generation_strategy.
-- The column records whether a control was generated with document-grouped or
-- ungrouped batching; it is mandatory and defaults to 'ungrouped'.
ALTER TABLE canonical_controls
    ADD COLUMN IF NOT EXISTS generation_strategy TEXT DEFAULT 'ungrouped' NOT NULL;

-- Attach a description of the allowed values to the column itself.
COMMENT ON COLUMN canonical_controls.generation_strategy
    IS 'How chunks were batched during generation: ungrouped (random), document_grouped (by regulation+article)';
@@ -0,0 +1,292 @@
-- Migration 059: CRA Annex I — Detaillierte Essential Cybersecurity Requirements
-- Erweitert den bestehenden Wiki-Artikel 'cra-security-controls' um Part 1 + Part 2,
-- Produktklassifizierung und ISO 27001 Mapping.
-- Zusaetzlich: Neuer Artikel fuer CRA-Produktklassifizierung und Konformitaetsbewertung.
-- ============================================================================
-- 1) Update: CRA Security Controls (Annex I) — Vollstaendige 8-Kategorien-Struktur
-- ============================================================================
UPDATE compliance_wiki_articles
SET
title = 'CRA Annex I — Essential Cybersecurity Requirements (Vollstaendig)',
summary = 'Annex I des CRA definiert die wesentlichen Cybersicherheitsanforderungen in zwei Teilen: Teil 1 (Produktsicherheit, 11 Anforderungen) und Teil 2 (Schwachstellenbehandlung, 8 Anforderungen). Daraus ergeben sich 40 konkrete Security-Controls in 8 Kategorien.',
content = '## Ueberblick
Der **EU Cyber Resilience Act (CRA)**, Verordnung (EU) 2024/2847, legt in **Annex I** die **Essential Cybersecurity Requirements** fest, die alle Produkte mit digitalen Elementen erfuellen muessen. Annex I besteht aus zwei Teilen:
- **Teil 1 – Sicherheitsanforderungen an Produkte** (11 Kernanforderungen)
- **Teil 2 – Anforderungen an die Schwachstellenbehandlung** (8 Prozessanforderungen)
Daraus lassen sich **40 konkrete Security-Controls** in **8 thematischen Kategorien** ableiten. Diese Controls bilden die Grundlage fuer eine Cybersecurity-Compliance-Strategie.
---
## Teil 1: Sicherheitsanforderungen an Produkte
### Kategorie 1 – Secure-by-Design und Architektur
Diese Controls stellen sicher, dass Sicherheit von Anfang an in die Produktarchitektur integriert wird.
| # | Control | CRA-Referenz | Beschreibung | ISO 27001 Mapping |
|---|---------|-------------|-------------|-------------------|
| 1 | **Secure-by-Default-Konfiguration** | Annex I, 1(1) | Produkte muessen mit sicheren Standardeinstellungen ausgeliefert werden. Keine offenen Ports, keine aktivierten Debug-Schnittstellen, keine unnoetig laufenden Dienste. | A.8.9 |
| 2 | **Minimale Angriffsflaeche** | Annex I, 1(2) | Nur notwendige Schnittstellen, Dienste und Protokolle aktivieren. Jede zusaetzliche Funktionalitaet vergroessert die Angriffsflaeche und muss einzeln gerechtfertigt werden. | A.8.9, A.8.20 |
| 3 | **Sichere Systemarchitektur** | Annex I, 1(3) | Sicherheitskritische Komponenten muessen isoliert werden (Sandboxing, Containerisierung, Privilege Separation). Defense-in-Depth-Prinzip anwenden. | A.8.27 |
| 4 | **Least-Privilege-Prinzip** | Annex I, 1(3)(d) | Jede Komponente, jeder Prozess und jeder Benutzer erhaelt nur die minimal notwendigen Berechtigungen. Privilegien-Eskalation muss verhindert werden. | A.8.2, A.8.3 |
| 5 | **Manipulationsschutz** | Annex I, 1(3)(c) | Schutz vor unautorisierter Aenderung von Software und Konfiguration durch Integritaetsmechanismen (Code Signing, Secure Boot, TPM). | A.8.24 |
| 6 | **Integritaetspruefung** | Annex I, 1(3)(c) | Automatische Ueberpruefung der Integritaet von Software, Firmware und Konfigurationsdaten bei Start und Laufzeit. Hash-basierte Validierung und digitale Signaturen. | A.8.24 |
### Kategorie 2 – Authentifizierung und Zugriffskontrolle
Controls zur Sicherstellung, dass nur autorisierte Personen und Systeme Zugriff erhalten.
| # | Control | CRA-Referenz | Beschreibung | ISO 27001 Mapping |
|---|---------|-------------|-------------|-------------------|
| 7 | **Starke Authentifizierung** | Annex I, 1(3)(d) | Implementierung sicherer Authentifizierungsmechanismen. Multi-Faktor-Authentifizierung fuer administrative Zugriffe. Unterstuetzung moderner Standards (FIDO2, WebAuthn). | A.8.5 |
| 8 | **Keine Default-Passwoerter** | Annex I, 1(3)(d) | Produkte duerfen keine universellen Standardpasswoerter verwenden. Jedes Geraet muss ein individuelles Passwort erhalten oder den Benutzer zur Aenderung bei Ersteinrichtung zwingen. | A.8.5 |
| 9 | **Sicheres Credential-Management** | Annex I, 1(3)(d) | Zugangsdaten muessen verschluesselt gespeichert werden (bcrypt, Argon2id). Keine Klartextspeicherung. API-Keys und Tokens regelmaessig rotieren. | A.8.5 |
| 10 | **Sitzungsmanagement** | Annex I, 1(3)(d) | Sichere Session-Verwaltung mit Timeout, Token-Binding und Session-Invalidierung bei Logout oder Passwortwechsel. CSRF-Schutz implementieren. | A.8.5 |
| 11 | **Brute-Force-Schutz** | Annex I, 1(3)(d) | Schutz vor Brute-Force- und Credential-Stuffing-Angriffen durch Rate Limiting, Account Lockout und CAPTCHA-Mechanismen. | A.8.5, A.8.16 |
| 12 | **Rollenbasierte Autorisierung** | Annex I, 1(3)(d) | Implementierung von RBAC (Role-Based Access Control). Trennung von administrativen und Nutzerfunktionen. Prinzip der geringsten Privilegien durchsetzen. | A.8.2, A.8.3 |
### Kategorie 3 – Kryptografie und Datenschutz
Controls zum Schutz von Daten durch kryptografische Verfahren.
| # | Control | CRA-Referenz | Beschreibung | ISO 27001 Mapping |
|---|---------|-------------|-------------|-------------------|
| 13 | **Verschluesselung sensibler Daten** | Annex I, 1(3)(e) | Alle sensiblen Daten muessen verschluesselt werden – sowohl bei der Speicherung (at rest, AES-256) als auch bei der Uebertragung (in transit, TLS 1.2+). | A.8.24 |
| 14 | **Speicher-Schutz (Data at Rest)** | Annex I, 1(3)(e) | Verschluesselung gespeicherter Daten auf Festplatten, in Datenbanken und Backups. Schluessel getrennt von Daten speichern. | A.8.24 |
| 15 | **Transport-Schutz (Data in Transit)** | Annex I, 1(3)(e) | Alle Netzwerkkommunikation ueber TLS 1.2 oder hoeher. Veraltete Protokolle (SSL, TLS 1.0/1.1) deaktivieren. Certificate Pinning fuer kritische Verbindungen. | A.8.24 |
| 16 | **Sicheres Schluesselmanagement** | Annex I, 1(3)(e) | Kryptografische Schluessel in HSM oder Vault speichern. Regelmaessige Rotation (mind. jaehrlich). Dokumentation der Schluessel-Lebenszyklen. Sofortige Rotation bei Kompromittierungsverdacht. | A.8.24 |
| 17 | **Datenminimierung** | Annex I, 1(3)(f) | Nur Daten erfassen und verarbeiten, die fuer die Produktfunktion erforderlich sind. Personenbezogene Daten gemaess DSGVO-Grundsaetzen behandeln. | A.8.10, A.8.11 |
### Kategorie 4 – Secure Software Development Lifecycle
Controls fuer sichere Softwareentwicklung ueber den gesamten Lebenszyklus.
| # | Control | CRA-Referenz | Beschreibung | ISO 27001 Mapping |
|---|---------|-------------|-------------|-------------------|
| 18 | **Strukturierter SSDLC** | Annex I, 1(1) | Implementierung eines formalen Secure Software Development Lifecycle mit definierten Security Gates in jeder Phase (Requirements, Design, Implementation, Test, Release). | A.8.25, A.8.26 |
| 19 | **Systematische Code Reviews** | Annex I, 1(1) | Peer Reviews mit Security-Fokus fuer jeden Code-Commit. Einsatz von Checklisten fuer OWASP Top 10 und CWE Top 25. Security Champions in jedem Entwicklerteam. | A.8.25 |
| 20 | **Automatisierte Sicherheitstests** | Annex I, 1(1) | Static Application Security Testing (SAST), Dynamic Application Security Testing (DAST) und Software Composition Analysis (SCA) in der CI/CD-Pipeline. Secrets Detection fuer eingebettete Zugangsdaten. | A.8.25 |
| 21 | **Supply-Chain-Security** | Annex I, 1(5) | Systematische Pruefung aller Drittanbieter-Komponenten auf Schwachstellen und Lizenz-Compliance. Vertrauenswuerdigkeit von Lieferanten bewerten. | A.5.19, A.5.21 |
| 22 | **Dependency-Monitoring** | Annex I, 1(5) | Kontinuierliche Ueberwachung aller Abhaengigkeiten auf bekannte Schwachstellen (CVE). Automatische Benachrichtigung bei neuen CVEs in verwendeten Bibliotheken. | A.8.8, A.8.25 |
| 23 | **Software Bill of Materials (SBOM)** | Annex I, 1(5) | Fuer jedes Produkt ein maschinenlesbares SBOM fuehren (CycloneDX oder SPDX). Mindestens Top-Level-Abhaengigkeiten mit Name, Version und Lizenz dokumentieren. SBOM bei jedem Release aktualisieren. | A.8.25 |
### Kategorie 5 – Logging, Monitoring und Anomalie-Erkennung
Controls zur Erkennung und Nachverfolgung von Sicherheitsereignissen.
| # | Control | CRA-Referenz | Beschreibung | ISO 27001 Mapping |
|---|---------|-------------|-------------|-------------------|
| 24 | **Security-Logging** | Annex I, 1(3)(g) | Protokollierung aller sicherheitsrelevanten Ereignisse: Login-Versuche, Berechtigungsaenderungen, administrative Aktionen, API-Zugriffe, Fehler und Ausnahmen. Logs muessen Zeitstempel, Akteur, Aktion und Ergebnis enthalten. | A.8.15 |
| 25 | **Ereignis-Monitoring** | Annex I, 1(3)(g) | Zentrale Sammlung und Echtzeit-Ueberwachung sicherheitsrelevanter Events. Einsatz eines SIEM-Systems oder vergleichbarer Loesung. Korrelation von Events aus verschiedenen Quellen. | A.8.16 |
| 26 | **Anomalie-Erkennung** | Annex I, 1(3)(g) | Automatische Erkennung von Angriffsmustern und ungewoehnlichem Verhalten. Alarmierung bei Abweichungen von Baseline-Verhalten. Integration von Threat Intelligence Feeds. | A.8.16 |
| 27 | **Log-Integritaet und -Aufbewahrung** | Annex I, 1(3)(g) | Logs muessen manipulationssicher gespeichert werden (append-only, signiert oder WORM). Aufbewahrung mindestens 12 Monate. Zugriff auf Logs nur fuer autorisiertes Security-Personal. | A.8.15 |
### Kategorie 6 – Update- und Patch-Management
Controls fuer die sichere Bereitstellung und Installation von Updates.
| # | Control | CRA-Referenz | Beschreibung | ISO 27001 Mapping |
|---|---------|-------------|-------------|-------------------|
| 28 | **Sichere Update-Mechanismen** | Annex I, 1(4) | Updates muessen ueber sichere Kanaele verteilt werden (HTTPS, signierte Pakete). Automatische oder einfach zugaengliche Update-Moeglichkeit fuer Endnutzer. Rollback-Faehigkeit bei fehlerhaften Updates. | A.8.8, A.8.19 |
| 29 | **Update-Authentizitaet** | Annex I, 1(4) | Alle Updates muessen digital signiert sein. Signaturpruefung vor Installation erzwingen. Verwendung vertrauenswuerdiger Signaturschluessel mit dokumentierter Key Ceremony. | A.8.24 |
| 30 | **Update-Integritaet** | Annex I, 1(4) | Integritaetspruefung jedes Update-Pakets vor und nach Installation (Hash-Vergleich, Signatur-Verifikation). Manipulation waehrend der Uebertragung erkennen und ablehnen. | A.8.24 |
| 31 | **Lifecycle-Support** | Annex I, 1(4) | Security-Updates waehrend des gesamten erwarteten Produktlebenszyklus bereitstellen – mindestens **5 Jahre** ab Inverkehrbringen oder die erwartete Nutzungsdauer, je nachdem welcher Zeitraum laenger ist. End-of-Life klar kommunizieren. | A.8.8 |
---
## Teil 2: Anforderungen an die Schwachstellenbehandlung
### Kategorie 7 – Vulnerability Management
Controls fuer die systematische Identifikation, Bewertung und Behebung von Schwachstellen.
| # | Control | CRA-Referenz | Beschreibung | ISO 27001 Mapping |
|---|---------|-------------|-------------|-------------------|
| 32 | **Schwachstellen-Identifikation** | Annex I, 2(1) | Kontinuierliches CVE-Monitoring aller eingesetzten Komponenten. Regelmaessige Vulnerability Scans (woechentlich automatisiert). Bug-Bounty-Programme oder Responsible-Disclosure-Kanaele einrichten. | A.8.8 |
| 33 | **SBOM-Pflege und Analyse** | Annex I, 2(1) | SBOM aktuell halten und kontinuierlich gegen CVE-Datenbanken pruefen. Automatische Alarmierung bei neu entdeckten Schwachstellen in verwendeten Komponenten. | A.8.8, A.8.25 |
| 34 | **Risikobasierte Priorisierung** | Annex I, 2(2) | Schwachstellen nach CVSS-Score und tatsaechlichem Risiko priorisieren. Reaktionszeiten nach Schweregrad: Kritisch (24–72h), Hoch (7 Tage), Mittel (30 Tage), Niedrig (naechster Zyklus). | A.8.8 |
| 35 | **Coordinated Vulnerability Disclosure** | Annex I, 2(5) | Veroeffentlichung einer CVD-Policy mit klarem Meldeprozess. Kontaktadresse fuer Sicherheitsforscher bereitstellen. Eingangsbestaetigung innerhalb von 5 Werktagen. Koordinierte Veroeffentlichung nach Patch-Verfuegbarkeit. | A.5.5, A.5.6 |
### Kategorie 8 – Incident Response und Meldepflichten
Controls fuer die Erkennung, Behandlung und Meldung von Sicherheitsvorfaellen.
| # | Control | CRA-Referenz | Beschreibung | ISO 27001 Mapping |
|---|---------|-------------|-------------|-------------------|
| 36 | **Incident-Response-Prozess** | Annex I, 2(5) | Dokumentierter Prozess mit definierten Phasen: Detection → Classification → Containment → Investigation → Recovery → Reporting → Lessons Learned. Regelmaessige Uebungen (Tabletop Exercises). | A.5.24, A.5.25, A.5.26 |
| 37 | **Fruehwarnung (24h)** | Annex I, 2(7) + Art. 14(2)(a) | Bei aktiv ausgenutzten Schwachstellen oder schweren Vorfaellen: Fruehwarnung an ENISA und/oder zustaendige nationale Behoerde innerhalb von **24 Stunden** nach Kenntniserlangung. | A.5.24, A.5.26 |
| 38 | **Detaillierter Vorfallsbericht (72h)** | Annex I, 2(7) + Art. 14(2)(b) | Innerhalb von **72 Stunden**: Detaillierter Bericht mit Umfang, Auswirkung, Ursachenanalyse und eingeleiteten Gegenmassnahmen. Bei personenbezogenen Daten zusaetzlich Art. 33/34 DSGVO beachten. | A.5.24, A.5.26 |
| 39 | **Patch-Bereitstellung** | Annex I, 2(3) | Patches fuer gemeldete und bestaetigte Schwachstellen so schnell wie moeglich bereitstellen. Sicherheitshinweise (Security Advisories) an Kunden veroeffentlichen. CSAF-Format fuer maschinenlesbare Advisories empfohlen. | A.8.8 |
| 40 | **Dokumentation und Nachbereitung** | Annex I, 2(6) | Alle Schwachstellen und Vorfaelle lueckenlos dokumentieren und fuer mindestens 10 Jahre aufbewahren. Lessons-Learned-Prozess nach jedem bedeutenden Vorfall. Ergebnisse in Risikobewertung einfliessen lassen. | A.5.27 |
---
## Produktklassifizierung nach CRA
Der CRA unterscheidet drei Produktkategorien mit unterschiedlichen Konformitaetsanforderungen:
### Standardprodukte (Default)
**Beispiele:** einfache Apps, Desktop-Software, Spiele, Foto-Editoren
- **Konformitaetsbewertung:** Selbstbewertung (Modul A)
- **Anforderungen:** Alle Annex-I-Anforderungen, aber einfachster Nachweis
- **Betrifft:** ca. 90% aller Produkte
### Wichtige Produkte (Annex III) – Klasse I
**Beispiele:** Passwort-Manager, VPN-Software, Firewalls, Router, Smart-Home-Systeme, IoT-Geraete mit Sensorfunktion, SIEM-Systeme
- **Konformitaetsbewertung:** Harmonisierte Standards oder Drittanbieter-Bewertung
- **Anforderungen:** Alle Annex-I-Anforderungen + erhoehte Nachweispflichten
- **Betrifft:** ca. 8% aller Produkte
### Wichtige Produkte – Klasse II
**Beispiele:** Betriebssysteme, Hypervisoren, Container-Runtimes, Public-Key-Infrastruktur, industrielle Steuerungssysteme (ICS/SCADA)
- **Konformitaetsbewertung:** Verpflichtende Drittanbieter-Bewertung durch benannte Stelle
- **Anforderungen:** Alle Annex-I-Anforderungen + strengste Nachweispflichten
- **Betrifft:** ca. 2% aller Produkte
### Kritische Produkte (Annex IV)
**Beispiele:** Hardware-Security-Module (HSM), Smartcard-Chips, Secure Elements, Smart-Meter-Gateways
- **Konformitaetsbewertung:** Europaeisches Cybersicherheitszertifikat erforderlich (EUCC)
- **Anforderungen:** Hoechste Stufe — europaeische Zertifizierung obligatorisch
---
## Zuordnung der Controls zu Dokumenten
Diese 40 Controls koennen automatisiert zu folgenden Compliance-Dokumenten fuehren:
| Dokument | Controls | Beschreibung |
|----------|----------|-------------|
| **Cybersecurity Policy** | 1-40 | Uebergreifendes Grundsatzdokument fuer Cybersicherheit |
| **Secure Development Policy** | 18-23 | Richtlinie fuer den sicheren Entwicklungsprozess (SSDLC) |
| **Vulnerability Management Policy** | 32-35, 39 | CVD, Patching, SBOM-Analyse |
| **Incident Response Plan** | 36-38, 40 | 24h/72h Meldung, Eskalation, Nachbereitung |
| **Access Control Policy** | 7-12 | Authentifizierung, Autorisierung, Passwort-Richtlinie |
| **Cryptographic Policy** | 13-17 | Verschluesselung, Schluesselmanagement, Datenschutz |
| **Update/Patch Policy** | 28-31 | Update-Mechanismen, Signierung, Lifecycle-Support |
| **Logging & Monitoring Policy** | 24-27 | Security-Logging, SIEM, Anomalie-Erkennung |
---
## Zeitplan fuer die Umsetzung
| Datum | Meilenstein |
|-------|------------|
| 10.12.2024 | CRA in Kraft getreten |
| 11.06.2026 | Konformitaetsbewertungsstellen muessen benannt sein |
| 11.09.2026 | **Meldepflichten aktiv** (Controls 37, 38) |
| 11.12.2027 | **Volle Anwendung** — alle 40 Controls muessen umgesetzt sein, CE-Kennzeichnung erforderlich |
---
## Sanktionen bei Nicht-Einhaltung
| Verstoss | Maximales Bussgeld |
|----------|-------------------|
| Wesentliche Anforderungen (Annex I) | 15 Mio. EUR oder 2,5% des weltweiten Jahresumsatzes |
| Sonstige Pflichten | 10 Mio. EUR oder 2% des weltweiten Jahresumsatzes |
| Falsche/unvollstaendige Informationen | 5 Mio. EUR oder 1% des weltweiten Jahresumsatzes |',
legal_refs = ARRAY['Annex I CRA', 'Annex III CRA', 'Annex IV CRA', 'Art. 13 CRA', 'Art. 14 CRA', 'Art. 15 CRA', 'Art. 64 CRA', '(EU) 2024/2847'],
tags = ARRAY['security-controls', 'annex-i', 'secure-by-design', 'authentifizierung', 'kryptografie', 'sbom', 'vulnerability', 'patching', 'incident-response', 'produktklassifizierung', 'iso-27001', 'ssdlc'],
relevance = 'critical',
updated_at = NOW()
WHERE id = 'cra-security-controls';
-- ============================================================================
-- 2) Neuer Artikel: CRA-Konformitaetsbewertung — Praktischer Leitfaden
-- ============================================================================
INSERT INTO compliance_wiki_articles (id, category_id, title, summary, content, legal_refs, tags, relevance, source_urls) VALUES
('cra-konformitaet', 'cra',
'CRA-Konformitaetsbewertung — Praktischer Leitfaden',
'Schritt-fuer-Schritt-Anleitung zur CRA-Konformitaetsbewertung: Produktklassifizierung, Dokumentation, Self-Assessment vs. Drittanbieter-Pruefung, CE-Kennzeichnung.',
'## Ueberblick
Jeder Hersteller muss vor dem Inverkehrbringen eine **Konformitaetsbewertung** durchfuehren, um nachzuweisen, dass sein Produkt die Essential Cybersecurity Requirements (Annex I) erfuellt. Der Aufwand haengt von der Produktkategorie ab.
## Schritt 1: Produkt klassifizieren
Bestimmen Sie, ob Ihr Produkt unter eine der Sonderkategorien faellt:
### Entscheidungsbaum
```
Ist das Produkt in Annex IV gelistet?
  Ja:   Kritisches Produkt → Europaeische Zertifizierung (EUCC)
  Nein: Weiter
Ist das Produkt in Annex III, Klasse II gelistet?
  Ja:   Wichtig Klasse II → Drittanbieter-Bewertung (Pflicht)
  Nein: Weiter
Ist das Produkt in Annex III, Klasse I gelistet?
  Ja:   Wichtig Klasse I → Harmonisierte Standards ODER Drittanbieter
  Nein: Standardprodukt → Selbstbewertung (Modul A)
```
## Schritt 2: Cybersecurity-Risikobewertung
Fuehren Sie eine systematische Risikoanalyse durch:
1. **Assets identifizieren** — Welche Daten verarbeitet das Produkt? Welche Schnittstellen hat es?
2. **Bedrohungen analysieren** — STRIDE-Methodik oder vergleichbar anwenden
3. **Schwachstellen bewerten** — Bekannte CVEs, Design-Schwaechen, Konfigurationsfehler
4. **Risiken priorisieren** — Eintrittswahrscheinlichkeit × Auswirkung
5. **Massnahmen definieren** — Welche Controls aus Annex I adressieren welches Risiko?
## Schritt 3: Controls implementieren
Setzen Sie die relevanten Controls aus den 8 Kategorien um (siehe Artikel "CRA Annex I Essential Cybersecurity Requirements"). Dokumentieren Sie fuer jeden Control:
- **Status**: Implementiert / In Bearbeitung / Nicht anwendbar
- **Nachweis**: Wie wird die Umsetzung belegt? (Code, Konfiguration, Test, Policy)
- **Verantwortlich**: Wer ist zustaendig?
## Schritt 4: Technische Dokumentation
Die technische Dokumentation muss enthalten:
- Beschreibung des Produkts und seiner Funktionen
- Cybersecurity-Risikobewertung
- Angewandte harmonisierte Normen
- Nachweis der Einhaltung jeder Annex-I-Anforderung
- SBOM (Software Bill of Materials)
- Informationen zum Support-Zeitraum
## Schritt 5: Konformitaetserklaerung und CE-Kennzeichnung
Nach erfolgreicher Bewertung:
1. **EU-Konformitaetserklaerung** ausstellen
2. **CE-Kennzeichnung** anbringen
3. **Dokumentation** mindestens 10 Jahre aufbewahren
4. Produkt darf in der EU vertrieben werden
## Haeufige Fehler
| Fehler | Konsequenz |
|--------|-----------|
| Default-Passwoerter nicht entfernt | Verstoss gegen Annex I, 1(3)(d) |
| Kein SBOM erstellt | Verstoss gegen Annex I, 1(5) |
| Kein Update-Mechanismus | Verstoss gegen Annex I, 1(4) |
| Keine CVD-Policy | Verstoss gegen Annex I, 2(5) |
| Support-Zeitraum nicht definiert | Verstoss gegen Art. 13(8) |
## Empfehlung
Nutzen Sie die **BreakPilot Compliance SDK Control Library**, um den Umsetzungsstand Ihrer CRA-Controls systematisch zu tracken und automatisiert Nachweise zu generieren.',
ARRAY['Annex I CRA', 'Annex II CRA', 'Annex III CRA', 'Annex IV CRA', 'Annex V CRA', 'Art. 13 CRA', 'Art. 24 CRA', 'Art. 25 CRA', 'Art. 26 CRA', 'Art. 27 CRA'],
ARRAY['konformitaet', 'ce-kennzeichnung', 'self-assessment', 'technische-dokumentation', 'sbom', 'risikobewertung'],
'important',
ARRAY['https://eur-lex.europa.eu/eli/reg/2024/2847/oj/eng'])
ON CONFLICT (id) DO NOTHING;
@@ -0,0 +1,120 @@
-- Migration 060: Multi-Layer Control Architecture — DB Schema
-- Adds obligation_extractions, control_patterns, and crosswalk_matrix tables.
-- Extends canonical_controls with pattern_id and obligation_ids columns.
--
-- Part of the Multi-Layer Control Architecture (Phase 1 of 8).
-- See: Legal Source → Obligation → Control Pattern → Master Control → Customer Instance
-- =============================================================================
-- 1. Obligation Extractions
-- Tracks how each RAG chunk was linked to an obligation (exact, embedding, LLM).
-- =============================================================================
-- Each row records how one RAG chunk was linked to an obligation and, optionally,
-- to a control pattern, a canonical control and the generation job that produced it.
-- Requires canonical_controls and canonical_generation_jobs to exist already,
-- because both are referenced by foreign keys below.
CREATE TABLE IF NOT EXISTS obligation_extractions (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
-- NOTE(review): VARCHAR(64) suggests a hex SHA-256 digest of the chunk — confirm against the writer.
chunk_hash VARCHAR(64) NOT NULL,
collection VARCHAR(100) NOT NULL,
regulation_code VARCHAR(100) NOT NULL,
article VARCHAR(100),
paragraph VARCHAR(100),
-- obligation_id and pattern_id are plain strings: no REFERENCES clause is declared,
-- so the database does not enforce that they point at existing rows.
obligation_id VARCHAR(50),
obligation_text TEXT,
-- Two-decimal score constrained to the closed interval [0, 1]; NULL is allowed.
confidence NUMERIC(3,2) CHECK (confidence >= 0 AND confidence <= 1),
-- Four methods are accepted. The table comment below names only three tiers
-- (exact/embedding/LLM); 'inferred' is the additional fourth value.
extraction_method VARCHAR(30) NOT NULL
CHECK (extraction_method IN ('exact_match', 'embedding_match', 'llm_extracted', 'inferred')),
pattern_id VARCHAR(50),
-- Same [0, 1] two-decimal range as confidence.
pattern_match_score NUMERIC(3,2) CHECK (pattern_match_score >= 0 AND pattern_match_score <= 1),
-- No ON DELETE clause: the default (NO ACTION) blocks deleting a referenced
-- control or job while extraction rows still point at it.
control_uuid UUID REFERENCES canonical_controls(id),
job_id UUID REFERENCES canonical_generation_jobs(id),
created_at TIMESTAMPTZ DEFAULT NOW()
);
-- Single-column lookup indexes, one per filterable column.
CREATE INDEX IF NOT EXISTS idx_oe_obligation ON obligation_extractions(obligation_id);
CREATE INDEX IF NOT EXISTS idx_oe_pattern ON obligation_extractions(pattern_id);
CREATE INDEX IF NOT EXISTS idx_oe_control ON obligation_extractions(control_uuid);
CREATE INDEX IF NOT EXISTS idx_oe_regulation ON obligation_extractions(regulation_code);
CREATE INDEX IF NOT EXISTS idx_oe_chunk ON obligation_extractions(chunk_hash);
CREATE INDEX IF NOT EXISTS idx_oe_method ON obligation_extractions(extraction_method);
COMMENT ON TABLE obligation_extractions IS
'Tracks chunk-to-obligation linkage from the 3-tier extraction pipeline (exact/embedding/LLM)';
-- =============================================================================
-- 2. Control Patterns Registry
-- DB mirror of the YAML pattern library for SQL queries and joins.
-- =============================================================================
-- Registry of control patterns. `id` is the surrogate key; `pattern_id` is the
-- human-readable business key (format per the table comment: CP-{DOMAIN}-{NNN}).
CREATE TABLE IF NOT EXISTS control_patterns (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
-- UNIQUE already creates an index on this column.
pattern_id VARCHAR(50) UNIQUE NOT NULL,
name VARCHAR(255) NOT NULL,
name_de VARCHAR(255),
domain VARCHAR(10) NOT NULL,
category VARCHAR(50),
description TEXT,
template_objective TEXT,
template_rationale TEXT,
-- All JSONB template/list columns default to an empty JSON array.
template_requirements JSONB DEFAULT '[]',
template_test_procedure JSONB DEFAULT '[]',
template_evidence JSONB DEFAULT '[]',
-- Nullable; when set, must be one of the four severity levels.
severity_default VARCHAR(20)
CHECK (severity_default IN ('low', 'medium', 'high', 'critical')),
-- Nullable; when set, must be a T-shirt size code.
implementation_effort_default VARCHAR(2)
CHECK (implementation_effort_default IN ('s', 'm', 'l', 'xl')),
obligation_match_keywords JSONB DEFAULT '[]',
tags JSONB DEFAULT '[]',
open_anchor_refs JSONB DEFAULT '[]',
composable_with JSONB DEFAULT '[]',
version VARCHAR(10) DEFAULT '1.0',
created_at TIMESTAMPTZ DEFAULT NOW(),
-- Only defaulted at insert time; this migration defines no trigger that refreshes
-- it on UPDATE — presumably writers set it explicitly (verify).
updated_at TIMESTAMPTZ DEFAULT NOW()
);
CREATE INDEX IF NOT EXISTS idx_cp_domain ON control_patterns(domain);
CREATE INDEX IF NOT EXISTS idx_cp_category ON control_patterns(category);
-- NOTE(review): redundant with the index implicitly created by the UNIQUE
-- constraint on pattern_id; kept as-is because later migrations may reference it by name.
CREATE INDEX IF NOT EXISTS idx_cp_pattern_id ON control_patterns(pattern_id);
COMMENT ON TABLE control_patterns IS
'Registry of control patterns (DB mirror of YAML library). Pattern ID format: CP-{DOMAIN}-{NNN}';
-- =============================================================================
-- 3. Crosswalk Matrix
-- The "golden thread" from legal source through to implementation.
-- =============================================================================
-- One row per mapping along the chain named in the table comment:
-- regulation → article → obligation → pattern → master control → TOM.
-- Only regulation_code is mandatory; every other link in the chain is nullable.
CREATE TABLE IF NOT EXISTS crosswalk_matrix (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
regulation_code VARCHAR(100) NOT NULL,
article VARCHAR(100),
paragraph VARCHAR(100),
-- obligation_id, pattern_id and tom_control_id carry no REFERENCES clause,
-- so referential integrity for them is not enforced by the database.
obligation_id VARCHAR(50),
pattern_id VARCHAR(50),
-- The master control is stored twice: as a string identifier and as a UUID
-- foreign key. Only the UUID column is checked against canonical_controls.
master_control_id VARCHAR(20),
master_control_uuid UUID REFERENCES canonical_controls(id),
tom_control_id VARCHAR(30),
-- Two-decimal score constrained to [0, 1]; NULL is allowed.
confidence NUMERIC(3,2) CHECK (confidence >= 0 AND confidence <= 1),
-- Provenance of the mapping; defaults to 'auto'.
source VARCHAR(30) DEFAULT 'auto'
CHECK (source IN ('manual', 'auto', 'migrated')),
created_at TIMESTAMPTZ DEFAULT NOW()
);
-- Composite index: serves lookups by regulation alone or by regulation + article.
CREATE INDEX IF NOT EXISTS idx_cw_regulation ON crosswalk_matrix(regulation_code, article);
CREATE INDEX IF NOT EXISTS idx_cw_obligation ON crosswalk_matrix(obligation_id);
CREATE INDEX IF NOT EXISTS idx_cw_pattern ON crosswalk_matrix(pattern_id);
-- Indexes the string identifier; master_control_uuid (the FK column) has no index here.
CREATE INDEX IF NOT EXISTS idx_cw_control ON crosswalk_matrix(master_control_id);
CREATE INDEX IF NOT EXISTS idx_cw_tom ON crosswalk_matrix(tom_control_id);
COMMENT ON TABLE crosswalk_matrix IS
'Golden thread: regulation → article → obligation → pattern → master control → TOM';
-- =============================================================================
-- 4. Extend canonical_controls with pattern + obligation linkage
-- =============================================================================
-- Adds the pattern/obligation linkage columns to the existing canonical_controls
-- table. IF NOT EXISTS makes both statements safe to re-run.
-- pattern_id is a plain string (no foreign key is declared).
ALTER TABLE canonical_controls
ADD COLUMN IF NOT EXISTS pattern_id VARCHAR(50);
-- Defaults to an empty JSON array.
ALTER TABLE canonical_controls
ADD COLUMN IF NOT EXISTS obligation_ids JSONB DEFAULT '[]';
-- Supports lookups of controls by pattern; obligation_ids is not indexed here.
CREATE INDEX IF NOT EXISTS idx_cc_pattern ON canonical_controls(pattern_id);
@@ -0,0 +1,49 @@
-- Migration 061: Obligation Candidates + Decomposition Tracking
-- Supports Pass 0a (Obligation Extraction from Rich Controls) and
-- Pass 0b (Atomic Control Composition).
--
-- Part of the Multi-Layer Control Architecture — Decomposition Pass.
-- =============================================================================
-- 1. Obligation Candidates
-- Individual normative obligations extracted from Rich Controls (Pass 0a).
-- =============================================================================
-- Stores obligations extracted from a parent control (Pass 0a per the table comment).
-- Every candidate must point at an existing canonical control.
CREATE TABLE IF NOT EXISTS obligation_candidates (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
-- Mandatory FK with default ON DELETE behaviour (NO ACTION): a parent control
-- cannot be deleted while candidates reference it.
parent_control_uuid UUID NOT NULL REFERENCES canonical_controls(id),
-- Human-readable candidate key. No UNIQUE constraint is declared, so
-- duplicates are not rejected by the database.
candidate_id VARCHAR(30) NOT NULL,
obligation_text TEXT NOT NULL,
-- action / object / condition: nullable free-text columns.
-- NOTE(review): presumably the parsed parts of the obligation sentence — confirm against the extractor.
action VARCHAR(500),
object TEXT,
condition TEXT,
-- Normative strength, restricted to three values; defaults to 'must'.
normative_strength VARCHAR(20) DEFAULT 'must'
CHECK (normative_strength IN ('must', 'should', 'may')),
is_test_obligation BOOLEAN DEFAULT FALSE,
is_reporting_obligation BOOLEAN DEFAULT FALSE,
-- Two-decimal score constrained to [0, 1]; defaults to 0.0.
extraction_confidence NUMERIC(3,2) DEFAULT 0.0
CHECK (extraction_confidence >= 0 AND extraction_confidence <= 1),
-- Defaults to an empty JSON object (not an array).
quality_flags JSONB DEFAULT '{}',
-- Lifecycle state of the candidate; new rows start as 'extracted'.
release_state VARCHAR(30) DEFAULT 'extracted'
CHECK (release_state IN ('extracted', 'validated', 'rejected', 'composed')),
created_at TIMESTAMPTZ DEFAULT NOW()
);
CREATE INDEX IF NOT EXISTS idx_oc_parent ON obligation_candidates(parent_control_uuid);
CREATE INDEX IF NOT EXISTS idx_oc_state ON obligation_candidates(release_state);
CREATE INDEX IF NOT EXISTS idx_oc_candidate ON obligation_candidates(candidate_id);
COMMENT ON TABLE obligation_candidates IS
'Individual normative obligations extracted from Rich Controls via Pass 0a decomposition';
-- =============================================================================
-- 2. Extend canonical_controls for decomposition tracking
-- =============================================================================
-- Adds decomposition tracking to canonical_controls. IF NOT EXISTS makes the
-- statements safe to re-run.
-- Self-referencing foreign key: a control may point at another control as its parent.
-- Nullable, so controls without a parent remain valid.
ALTER TABLE canonical_controls
ADD COLUMN IF NOT EXISTS parent_control_uuid UUID REFERENCES canonical_controls(id);
-- Free-form method label; no CHECK constraint restricts its values.
ALTER TABLE canonical_controls
ADD COLUMN IF NOT EXISTS decomposition_method VARCHAR(30);
-- Supports finding all children of a given parent control.
CREATE INDEX IF NOT EXISTS idx_cc_parent ON canonical_controls(parent_control_uuid);
@@ -0,0 +1,22 @@
-- Migration 062: Add pipeline_version to track which generation rules produced each control/chunk
--
-- v1 = Original pipeline (local LLM prefilter, old prompt without null-skip)
-- v2 = Improved pipeline (skip_prefilter, Anthropic decides relevance, annexes protected)
--
-- This allows identifying controls that may need reprocessing when pipeline rules change.
-- Adds a pipeline_version marker to controls and processed chunks.
-- NOT NULL DEFAULT 1 means every row that already exists when this migration
-- runs is recorded as version 1.
ALTER TABLE canonical_controls
ADD COLUMN IF NOT EXISTS pipeline_version smallint NOT NULL DEFAULT 1;
ALTER TABLE canonical_processed_chunks
ADD COLUMN IF NOT EXISTS pipeline_version smallint NOT NULL DEFAULT 1;
-- Index for efficient querying by version
CREATE INDEX IF NOT EXISTS idx_canonical_controls_pipeline_version
ON canonical_controls (pipeline_version);
CREATE INDEX IF NOT EXISTS idx_canonical_processed_chunks_pipeline_version
ON canonical_processed_chunks (pipeline_version);
-- The column comments document values 1 and 2 only; no CHECK constraint limits
-- the column to those values, so higher versions can be stored without a schema change.
COMMENT ON COLUMN canonical_controls.pipeline_version IS 'Generation pipeline version: 1=original (local prefilter), 2=improved (Anthropic decides relevance, annexes protected)';
COMMENT ON COLUMN canonical_processed_chunks.pipeline_version IS 'Pipeline version used when this chunk was processed';
@@ -0,0 +1,23 @@
-- Migration 063: Scoped Control Applicability
--
-- Adds 3 new JSONB columns to canonical_controls for filtering controls
-- based on customer industry, company size, and compliance scope.
--
-- v3 pipeline generates these fields automatically via LLM.
-- Old controls (v1/v2) will be backfilled separately.
-- Adds three nullable JSONB columns in a single ALTER TABLE.
-- DEFAULT NULL is deliberate: per the column comments, NULL means
-- "not yet classified" (or "no conditions" for scope_conditions).
ALTER TABLE canonical_controls
ADD COLUMN IF NOT EXISTS applicable_industries JSONB DEFAULT NULL,
ADD COLUMN IF NOT EXISTS applicable_company_size JSONB DEFAULT NULL,
ADD COLUMN IF NOT EXISTS scope_conditions JSONB DEFAULT NULL;
-- GIN index for JSONB containment queries (e.g. applicable_industries @> '"Telekommunikation"')
-- Only the two array columns are indexed; scope_conditions has no index in this migration.
CREATE INDEX IF NOT EXISTS idx_cc_applicable_industries
ON canonical_controls USING gin (applicable_industries);
CREATE INDEX IF NOT EXISTS idx_cc_applicable_company_size
ON canonical_controls USING gin (applicable_company_size);
COMMENT ON COLUMN canonical_controls.applicable_industries IS 'Industries this control applies to, e.g. ["all"] or ["Telekommunikation", "Energie"]. NULL = not yet classified.';
COMMENT ON COLUMN canonical_controls.applicable_company_size IS 'Company sizes this control applies to, e.g. ["all"] or ["medium", "large", "enterprise"]. NULL = not yet classified.';
COMMENT ON COLUMN canonical_controls.scope_conditions IS 'Optional scope conditions, e.g. {"requires_any": ["uses_ai"], "description": "..."}. NULL = no conditions.';
@@ -0,0 +1,105 @@
-- Migration 064: VVT Master Libraries — 8 global reference tables
-- These are shared across all tenants (no tenant_id).
BEGIN;
-- 1. Data Subjects (Betroffenenkategorien)
-- Reference list of data-subject categories. The primary key is a
-- caller-supplied string code (no generated default), not a UUID.
CREATE TABLE IF NOT EXISTS vvt_lib_data_subjects (
id VARCHAR(50) PRIMARY KEY,
label_de VARCHAR(200) NOT NULL,
description_de TEXT,
-- Flags categories relevant to Art. 9; defaults to FALSE.
art9_relevant BOOLEAN DEFAULT FALSE,
-- JSON array, empty by default.
-- NOTE(review): presumably business-function keys this category is typical for — confirm against consumers.
typical_for JSONB DEFAULT '[]'::jsonb,
-- Display ordering hint; defaults to 0.
sort_order INTEGER DEFAULT 0,
created_at TIMESTAMPTZ DEFAULT NOW()
);
-- 2. Data Categories (Datenkategorien — hierarchisch)
-- Hierarchical reference list of data categories (parent/child via parent_id).
CREATE TABLE IF NOT EXISTS vvt_lib_data_categories (
id VARCHAR(50) PRIMARY KEY,
-- Self-reference to the parent category. ON DELETE SET NULL turns children of
-- a deleted parent into top-level rows instead of blocking or cascading the delete.
parent_id VARCHAR(50) REFERENCES vvt_lib_data_categories(id) ON DELETE SET NULL,
label_de VARCHAR(200) NOT NULL,
description_de TEXT,
is_art9 BOOLEAN DEFAULT FALSE,
is_art10 BOOLEAN DEFAULT FALSE,
-- Integer weight from 1 to 5 inclusive; defaults to 1.
risk_weight INTEGER DEFAULT 1 CHECK (risk_weight BETWEEN 1 AND 5),
-- Plain string codes with no REFERENCES clause.
-- NOTE(review): presumably ids from vvt_lib_retention_rules / vvt_lib_legal_bases — not enforced by the schema.
default_retention_rule VARCHAR(50),
default_legal_basis VARCHAR(50),
sort_order INTEGER DEFAULT 0,
created_at TIMESTAMPTZ DEFAULT NOW()
);
-- Supports fetching the children of a category.
CREATE INDEX IF NOT EXISTS idx_vvt_lib_data_categories_parent ON vvt_lib_data_categories(parent_id);
-- 3. Recipients (Empfaengerkategorien)
-- Reference list of recipient categories, keyed by a caller-supplied string code.
CREATE TABLE IF NOT EXISTS vvt_lib_recipients (
id VARCHAR(50) PRIMARY KEY,
-- Recipient kind; restricted to the four listed values.
type VARCHAR(20) NOT NULL CHECK (type IN ('INTERNAL', 'PROCESSOR', 'CONTROLLER', 'AUTHORITY')),
label_de VARCHAR(200) NOT NULL,
description_de TEXT,
is_third_country BOOLEAN DEFAULT FALSE,
-- Nullable short country code (up to 5 characters); format is not validated.
country VARCHAR(5),
sort_order INTEGER DEFAULT 0,
created_at TIMESTAMPTZ DEFAULT NOW()
);
-- 4. Legal Bases (Rechtsgrundlagen)
-- Reference list of legal bases, keyed by a caller-supplied string code.
CREATE TABLE IF NOT EXISTS vvt_lib_legal_bases (
id VARCHAR(50) PRIMARY KEY,
-- Citation text of the legal provision (free-form, mandatory).
article VARCHAR(50) NOT NULL,
-- Classification of the legal basis; restricted to the eight listed values.
type VARCHAR(30) NOT NULL CHECK (type IN ('CONSENT', 'CONTRACT', 'LEGAL_OBLIGATION', 'VITAL_INTEREST', 'PUBLIC_TASK', 'LEGITIMATE_INTEREST', 'ART9', 'NATIONAL')),
label_de VARCHAR(300) NOT NULL,
description_de TEXT,
-- Stored independently of `type`; nothing in the schema ties is_art9 to type = 'ART9'.
is_art9 BOOLEAN DEFAULT FALSE,
typical_national_law VARCHAR(100),
sort_order INTEGER DEFAULT 0,
created_at TIMESTAMPTZ DEFAULT NOW()
);
-- 5. Retention Rules (Aufbewahrungsfristen)
-- Reference list of retention rules, keyed by a caller-supplied string code.
CREATE TABLE IF NOT EXISTS vvt_lib_retention_rules (
id VARCHAR(50) PRIMARY KEY,
label_de VARCHAR(300) NOT NULL,
description_de TEXT,
-- Free-text citation of the governing law; not a reference to vvt_lib_legal_bases.
legal_basis VARCHAR(200),
-- The retention period is the pair (duration, duration_unit). Both are mandatory.
-- No CHECK restricts duration, so 0 and negative values are accepted by the schema.
duration INTEGER NOT NULL,
duration_unit VARCHAR(10) NOT NULL CHECK (duration_unit IN ('DAYS', 'MONTHS', 'YEARS')),
-- Free-text description of the event that starts the retention period.
start_event VARCHAR(200),
deletion_procedure VARCHAR(500),
sort_order INTEGER DEFAULT 0,
created_at TIMESTAMPTZ DEFAULT NOW()
);
-- 6. Transfer Mechanisms (Uebermittlungsmechanismen)
-- Reference list of transfer mechanisms, keyed by a caller-supplied string code.
CREATE TABLE IF NOT EXISTS vvt_lib_transfer_mechanisms (
id VARCHAR(50) PRIMARY KEY,
label_de VARCHAR(300) NOT NULL,
description_de TEXT,
-- Optional citation of the governing article (free-form text).
article VARCHAR(50),
-- NOTE(review): presumably "transfer impact assessment required" — confirm the meaning of TIA with consumers.
requires_tia BOOLEAN DEFAULT FALSE,
sort_order INTEGER DEFAULT 0,
created_at TIMESTAMPTZ DEFAULT NOW()
);
-- 7. Purposes (Verarbeitungszwecke)
-- Reference list of processing purposes, keyed by a caller-supplied string code.
CREATE TABLE IF NOT EXISTS vvt_lib_purposes (
id VARCHAR(50) PRIMARY KEY,
label_de VARCHAR(300) NOT NULL,
description_de TEXT,
-- Plain string code with no REFERENCES clause.
-- NOTE(review): presumably an id from vvt_lib_legal_bases — not enforced by the schema.
typical_legal_basis VARCHAR(50),
-- JSON array, empty by default.
typical_for JSONB DEFAULT '[]'::jsonb,
sort_order INTEGER DEFAULT 0,
created_at TIMESTAMPTZ DEFAULT NOW()
);
-- 8. TOMs (Technisch-Organisatorische Massnahmen)
-- Reference list of technical and organisational measures (TOMs),
-- keyed by a caller-supplied string code.
CREATE TABLE IF NOT EXISTS vvt_lib_toms (
id VARCHAR(50) PRIMARY KEY,
-- Restricted to five camelCase category keys; the comparison is case-sensitive.
category VARCHAR(30) NOT NULL CHECK (category IN ('accessControl', 'confidentiality', 'integrity', 'availability', 'separation')),
label_de VARCHAR(300) NOT NULL,
description_de TEXT,
-- Optional free-text reference (by name, to Art. 32).
art32_reference VARCHAR(100),
sort_order INTEGER DEFAULT 0,
created_at TIMESTAMPTZ DEFAULT NOW()
);
COMMIT;
@@ -0,0 +1,200 @@
-- Migration 065: VVT Library Seed Data (~150 entries)
-- All content self-authored, MIT-compatible.
BEGIN;
-- =============================================================================
-- Data Subjects (15)
-- =============================================================================
-- Seeds 15 data-subject categories with sort_order 1..15.
-- typical_for is supplied as a JSON array literal per row.
-- PATIENTS is the only row seeded with art9_relevant = TRUE.
INSERT INTO vvt_lib_data_subjects (id, label_de, description_de, art9_relevant, typical_for, sort_order) VALUES
('EMPLOYEES', 'Beschaeftigte', 'Aktuelle Mitarbeiterinnen und Mitarbeiter', FALSE, '["hr","it_operations"]', 1),
('APPLICANTS', 'Bewerber', 'Stellenbewerberinnen und -bewerber', FALSE, '["hr"]', 2),
('CUSTOMERS', 'Kunden', 'Aktive Kundinnen und Kunden', FALSE, '["sales_crm","support","finance"]', 3),
('PROSPECTIVE_CUSTOMERS', 'Interessenten', 'Potenzielle Kundinnen und Kunden', FALSE, '["marketing","sales_crm"]', 4),
('SUPPLIERS', 'Lieferanten', 'Geschaeftspartner als Lieferanten', FALSE, '["finance"]', 5),
('BUSINESS_PARTNERS', 'Geschaeftspartner', 'Kooperationspartner, Berater, Dienstleister', FALSE, '["management","finance"]', 6),
('VISITORS', 'Besucher', 'Betriebsbesucher und Gaeste', FALSE, '["management"]', 7),
('WEBSITE_USERS', 'Website-Nutzer', 'Besucher der Unternehmenswebsite', FALSE, '["marketing","it_operations"]', 8),
('APP_USERS', 'App-Nutzer', 'Nutzer mobiler Anwendungen', FALSE, '["product_engineering"]', 9),
('NEWSLETTER_SUBSCRIBERS', 'Newsletter-Abonnenten', 'Empfaenger von Newslettern', FALSE, '["marketing"]', 10),
('MEMBERS', 'Mitglieder', 'Vereins- oder Verbandsmitglieder', FALSE, '["management"]', 11),
('PATIENTS', 'Patienten', 'Patientinnen und Patienten', TRUE, '["other"]', 12),
('STUDENTS', 'Schueler/Studierende', 'Lernende in Bildungseinrichtungen', FALSE, '["other"]', 13),
('MINORS', 'Minderjaehrige', 'Personen unter 16 Jahren (Art. 8 DSGVO)', FALSE, '["other"]', 14),
('OTHER', 'Sonstige', 'Andere Betroffenenkategorien', FALSE, '[]', 15)
-- Idempotent: rows whose id already exists are left untouched, so edits to
-- this seed do not propagate to databases that were seeded earlier.
ON CONFLICT (id) DO NOTHING;
-- =============================================================================
-- Data Categories — Parent categories (9)
-- =============================================================================
-- Seeds the 9 top-level categories (parent_id = NULL). These must exist before
-- any child row is inserted, because children reference them through the
-- parent_id foreign key.
-- default_retention_rule / default_legal_basis are omitted here and stay NULL.
INSERT INTO vvt_lib_data_categories (id, parent_id, label_de, description_de, is_art9, is_art10, risk_weight, sort_order) VALUES
('IDENTIFICATION', NULL, 'Identifikationsdaten', 'Daten zur Identifizierung natuerlicher Personen', FALSE, FALSE, 2, 1),
('CONTACT_DATA', NULL, 'Kontaktdaten', 'Kommunikationsdaten und Adressen', FALSE, FALSE, 1, 2),
('FINANCIAL', NULL, 'Finanzdaten', 'Bank-, Gehalts- und Zahlungsdaten', FALSE, FALSE, 3, 3),
('EMPLOYMENT', NULL, 'Beschaeftigungsdaten', 'Arbeitsverhaeltnis und Qualifikation', FALSE, FALSE, 2, 4),
('DIGITAL_IDENTITY', NULL, 'Digitale Identitaet', 'Online-Kennungen und Zugangsdaten', FALSE, FALSE, 2, 5),
('COMMUNICATION', NULL, 'Kommunikationsdaten', 'Nachrichten und Vertragsdaten', FALSE, FALSE, 2, 6),
('MEDIA', NULL, 'Medien- und Standortdaten', 'Bild, Video, Standort', FALSE, FALSE, 3, 7),
('ART9_SPECIAL', NULL, 'Besondere Kategorien (Art. 9)', 'Besonders schuetzenswerte Daten', TRUE, FALSE, 5, 8),
('ART10', NULL, 'Strafrechtliche Daten (Art. 10)', 'Daten ueber strafrechtliche Verurteilungen', FALSE, TRUE, 5, 9)
-- Idempotent: existing ids are skipped rather than updated.
ON CONFLICT (id) DO NOTHING;
-- =============================================================================
-- Data Categories — Child categories (29)
-- =============================================================================
-- Each row names its parent category in the second column; the parent rows are
-- seeded by the preceding INSERT. sort_order is grouped by parent in blocks of
-- ten (10s = IDENTIFICATION, 20s = CONTACT_DATA, ... 90 = ART10).
-- default_retention_rule and default_legal_basis are string codes; every code
-- used below matches an id seeded into vvt_lib_retention_rules or
-- vvt_lib_legal_bases later in this file (no foreign key enforces this).
INSERT INTO vvt_lib_data_categories (id, parent_id, label_de, description_de, is_art9, is_art10, risk_weight, default_retention_rule, default_legal_basis, sort_order) VALUES
('NAME', 'IDENTIFICATION', 'Name', 'Vor- und Nachname, Geburtsname', FALSE, FALSE, 1, NULL, NULL, 10),
('DOB', 'IDENTIFICATION', 'Geburtsdatum', 'Geburtstag und -ort', FALSE, FALSE, 2, NULL, NULL, 11),
('ADDRESS', 'CONTACT_DATA', 'Anschrift', 'Wohn- und Postadresse', FALSE, FALSE, 1, NULL, NULL, 20),
('CONTACT', 'CONTACT_DATA', 'Kontaktinformationen', 'Telefon, E-Mail, Fax', FALSE, FALSE, 1, NULL, NULL, 21),
('ID_NUMBER', 'IDENTIFICATION', 'Ausweisnummer', 'Personalausweis-, Reisepassnummer', FALSE, FALSE, 3, NULL, NULL, 12),
('SOCIAL_SECURITY', 'IDENTIFICATION', 'Sozialversicherungsnummer', 'SV-Nummer', FALSE, FALSE, 4, 'BDSG_35_DELETE', 'ART6_1C', 13),
('TAX_ID', 'FINANCIAL', 'Steuer-ID', 'Steueridentifikationsnummer', FALSE, FALSE, 3, 'AO_147_10Y', 'ART6_1C', 30),
('BANK_ACCOUNT', 'FINANCIAL', 'Bankverbindung', 'IBAN, BIC, Kontonummer', FALSE, FALSE, 3, 'HGB_257_10Y', 'ART6_1B', 31),
('PAYMENT_DATA', 'FINANCIAL', 'Zahlungsdaten', 'Kreditkartendaten, Zahlungshistorie', FALSE, FALSE, 4, 'HGB_257_10Y', 'ART6_1B', 32),
('SALARY_DATA', 'FINANCIAL', 'Gehaltsdaten', 'Brutto/Netto, Zulagen, Abzuege', FALSE, FALSE, 4, 'AO_147_10Y', 'BDSG_26', 33),
('EMPLOYMENT_DATA', 'EMPLOYMENT', 'Arbeitsvertragsdaten', 'Vertragsdetails, Position, Abteilung', FALSE, FALSE, 2, 'HGB_257_10Y', 'BDSG_26', 40),
('EDUCATION_DATA', 'EMPLOYMENT', 'Ausbildungsdaten', 'Zeugnisse, Qualifikationen, Zertifikate', FALSE, FALSE, 2, 'AGG_15_6M', 'BDSG_26', 41),
('IP_ADDRESS', 'DIGITAL_IDENTITY', 'IP-Adresse', 'IPv4/IPv6 Adressen', FALSE, FALSE, 2, 'CUSTOM_90D', 'ART6_1F', 50),
('DEVICE_ID', 'DIGITAL_IDENTITY', 'Geraete-ID', 'Browser-Fingerprint, Device-ID', FALSE, FALSE, 2, 'CUSTOM_14M', 'ART6_1A', 51),
('LOGIN_DATA', 'DIGITAL_IDENTITY', 'Zugangsdaten', 'Benutzername, Passwort-Hash', FALSE, FALSE, 3, NULL, 'ART6_1B', 52),
('USAGE_DATA', 'DIGITAL_IDENTITY', 'Nutzungsdaten', 'Klickverhalten, Seitenaufrufe, Sessions', FALSE, FALSE, 2, 'CUSTOM_14M', 'ART6_1A', 53),
('COMMUNICATION_DATA', 'COMMUNICATION', 'Korrespondenz', 'E-Mails, Chat-Nachrichten, Briefe', FALSE, FALSE, 2, 'BGB_195_3Y', NULL, 60),
('CONTRACT_DATA', 'COMMUNICATION', 'Vertragsdaten', 'Vertragsdetails, Bestellungen', FALSE, FALSE, 2, 'HGB_257_10Y', 'ART6_1B', 61),
('PHOTO_VIDEO', 'MEDIA', 'Bild-/Videodaten', 'Fotos, Videos von Personen', FALSE, FALSE, 3, 'CONSENT_REVOKE', 'ART6_1A', 70),
('LOCATION_DATA', 'MEDIA', 'Standortdaten', 'GPS-Koordinaten, Aufenthaltsorte', FALSE, FALSE, 3, 'CUSTOM_90D', 'ART6_1A', 71),
('HEALTH_DATA', 'ART9_SPECIAL', 'Gesundheitsdaten', 'Krankheitsdaten, Atteste, Behinderung', TRUE, FALSE, 5, 'BDSG_35_DELETE', 'ART9_2H', 80),
('GENETIC_DATA', 'ART9_SPECIAL', 'Genetische Daten', 'DNA-Analysen, genetische Merkmale', TRUE, FALSE, 5, 'BDSG_35_DELETE', 'ART9_2A', 81),
('BIOMETRIC_DATA', 'ART9_SPECIAL', 'Biometrische Daten', 'Fingerabdruck, Gesichtserkennung', TRUE, FALSE, 5, 'BDSG_35_DELETE', 'ART9_2A', 82),
('RACIAL_ETHNIC', 'ART9_SPECIAL', 'Rassische/ethnische Herkunft', 'Ethnische Zugehoerigkeit', TRUE, FALSE, 5, NULL, 'ART9_2A', 83),
('POLITICAL_OPINIONS', 'ART9_SPECIAL', 'Politische Meinungen', 'Parteizugehoerigkeit, politische Haltung', TRUE, FALSE, 5, NULL, 'ART9_2A', 84),
('RELIGIOUS_BELIEFS', 'ART9_SPECIAL', 'Religioese Ueberzeugungen', 'Konfession, religioese Praktiken', TRUE, FALSE, 5, NULL, 'ART9_2A', 85),
('TRADE_UNION', 'ART9_SPECIAL', 'Gewerkschaftszugehoerigkeit', 'Mitgliedschaft in Gewerkschaften', TRUE, FALSE, 5, NULL, 'ART9_2A', 86),
('SEX_LIFE', 'ART9_SPECIAL', 'Sexualleben/Orientierung', 'Sexuelle Orientierung', TRUE, FALSE, 5, NULL, 'ART9_2A', 87),
('CRIMINAL_DATA', 'ART10', 'Strafrechtliche Daten', 'Verurteilungen, Straftaten, Fuehrungszeugnis', FALSE, TRUE, 5, 'BDSG_35_DELETE', 'BDSG_24', 90)
-- Idempotent: existing ids are skipped rather than updated.
ON CONFLICT (id) DO NOTHING;
-- =============================================================================
-- Legal Bases (12)
-- =============================================================================
-- Seeds the 12 legal bases with sort_order 1..12: six Art. 6(1) bases, three
-- Art. 9(2) bases (is_art9 = TRUE) and three national provisions (type 'NATIONAL').
-- The ids are the string codes that vvt_lib_data_categories.default_legal_basis
-- uses in its seed rows.
-- Fix: the label of ART6_1B read 'Vertragserfullung'; it now follows the ue
-- transliteration used everywhere else in this file ('Vertragserfuellung').
-- Because of ON CONFLICT DO NOTHING, databases that were already seeded keep
-- the old label until it is corrected by a separate UPDATE.
INSERT INTO vvt_lib_legal_bases (id, article, type, label_de, description_de, is_art9, typical_national_law, sort_order) VALUES
('ART6_1A', 'Art. 6 Abs. 1 lit. a', 'CONSENT', 'Einwilligung', 'Die betroffene Person hat ihre Einwilligung gegeben', FALSE, NULL, 1),
('ART6_1B', 'Art. 6 Abs. 1 lit. b', 'CONTRACT', 'Vertragserfuellung', 'Erforderlich fuer die Erfuellung eines Vertrags', FALSE, NULL, 2),
('ART6_1C', 'Art. 6 Abs. 1 lit. c', 'LEGAL_OBLIGATION', 'Rechtliche Verpflichtung', 'Erforderlich zur Erfuellung einer rechtlichen Verpflichtung', FALSE, NULL, 3),
('ART6_1D', 'Art. 6 Abs. 1 lit. d', 'VITAL_INTEREST', 'Lebenswichtige Interessen', 'Schutz lebenswichtiger Interessen', FALSE, NULL, 4),
('ART6_1E', 'Art. 6 Abs. 1 lit. e', 'PUBLIC_TASK', 'Oeffentliches Interesse', 'Wahrnehmung einer Aufgabe im oeffentlichen Interesse', FALSE, NULL, 5),
('ART6_1F', 'Art. 6 Abs. 1 lit. f', 'LEGITIMATE_INTEREST', 'Berechtigtes Interesse', 'Wahrung berechtigter Interessen des Verantwortlichen', FALSE, NULL, 6),
('ART9_2A', 'Art. 9 Abs. 2 lit. a', 'ART9', 'Ausdrueckliche Einwilligung (Art. 9)', 'Ausdrueckliche Einwilligung fuer besondere Kategorien', TRUE, NULL, 7),
('ART9_2B', 'Art. 9 Abs. 2 lit. b', 'ART9', 'Arbeitsrecht (Art. 9)', 'Erforderlich im Arbeitsrecht', TRUE, 'BDSG § 26', 8),
('ART9_2H', 'Art. 9 Abs. 2 lit. h', 'ART9', 'Gesundheitsvorsorge (Art. 9)', 'Gesundheitsvorsorge oder Arbeitsmedizin', TRUE, NULL, 9),
-- NOTE(review): the label 'Beschaeftigtenverhaeltnis' differs from the wording in
-- its own description ('Beschaeftigungsverhaeltnisses') — confirm which is intended.
('BDSG_26', '§ 26 BDSG', 'NATIONAL', 'Beschaeftigtenverhaeltnis', 'Datenverarbeitung fuer Zwecke des Beschaeftigungsverhaeltnisses', FALSE, 'BDSG § 26', 10),
('BDSG_24', '§ 24 BDSG', 'NATIONAL', 'Strafrechtliche Daten', 'Verarbeitung strafrechtlicher Daten (Art. 10 DSGVO)', FALSE, 'BDSG § 24', 11),
('UWG_7', '§ 7 UWG', 'NATIONAL', 'Werbung mit Einwilligung', 'Werbliche Ansprache nach UWG', FALSE, 'UWG § 7', 12)
-- Idempotent: existing ids are skipped rather than updated.
ON CONFLICT (id) DO NOTHING;
-- =============================================================================
-- Retention Rules (12)
-- =============================================================================
-- Seeds the 12 retention rules with sort_order 1..12. The ids are the string codes
-- that vvt_lib_data_categories.default_retention_rule uses in its seed rows.
-- Three rows (CONSENT_REVOKE, PURPOSE_END, BDSG_35_DELETE) carry duration 0 with
-- unit 'DAYS'. NOTE(review): this looks like a sentinel for event-driven rules
-- with no fixed period, since duration is NOT NULL — confirm consumers treat 0 that way.
-- The three CUSTOM_* rows have no legal_basis (NULL).
INSERT INTO vvt_lib_retention_rules (id, label_de, description_de, legal_basis, duration, duration_unit, start_event, deletion_procedure, sort_order) VALUES
('HGB_257_10Y', '10 Jahre (HGB § 257)', 'Handelsrechtliche Aufbewahrungspflicht fuer Handelsbuecher, Jahresabschluesse, Buchungsbelege', 'HGB § 257', 10, 'YEARS', 'Ende des Kalenderjahres', 'Vernichtung nach Ablauf der Aufbewahrungsfrist', 1),
('AO_147_10Y', '10 Jahre (AO § 147)', 'Steuerrechtliche Aufbewahrungspflicht fuer Buchungsbelege', 'AO § 147', 10, 'YEARS', 'Ende des Kalenderjahres', 'Vernichtung nach Ablauf der Aufbewahrungsfrist', 2),
('AO_147_6Y', '6 Jahre (AO § 147)', 'Steuerrechtliche Aufbewahrungspflicht fuer Geschaeftsbriefe', 'AO § 147', 6, 'YEARS', 'Ende des Kalenderjahres', 'Vernichtung nach Ablauf der Aufbewahrungsfrist', 3),
('AGG_15_6M', '6 Monate (AGG § 15)', 'Frist fuer Schadensersatzansprueche nach AGG', 'AGG § 15', 6, 'MONTHS', 'Ablehnung / Ende des Verfahrens', 'Loeschung personenbezogener Bewerbungsdaten', 4),
('ARBZG_16_2Y', '2 Jahre (ArbZG § 16)', 'Aufzeichnungspflicht der Arbeitszeiten', 'ArbZG § 16', 2, 'YEARS', 'Ende des Aufzeichnungszeitraums', 'Vernichtung der Arbeitszeitaufzeichnungen', 5),
('BGB_195_3Y', '3 Jahre (BGB § 195)', 'Regelverjaehrungsfrist fuer vertragliche Ansprueche', 'BGB § 195', 3, 'YEARS', 'Ende des Jahres der Anspruchsentstehung', 'Loeschung nach Ablauf der Verjaehrungsfrist', 6),
('CONSENT_REVOKE', 'Bis Widerruf', 'Speicherung bis zum Widerruf der Einwilligung', 'Art. 7 Abs. 3 DSGVO', 0, 'DAYS', 'Widerruf der Einwilligung', 'Unverzuegliche Loeschung nach Widerruf', 7),
('PURPOSE_END', 'Bis Zweckerfuellung', 'Speicherung bis der Verarbeitungszweck erreicht ist', 'Art. 5 Abs. 1 lit. e DSGVO', 0, 'DAYS', 'Zweckerfuellung', 'Loeschung nach Zweckerfuellung', 8),
('BDSG_35_DELETE', 'Unverzuegliche Loeschung', 'Loeschung sobald Speicherung nicht mehr erforderlich', 'BDSG § 35', 0, 'DAYS', 'Wegfall der Erforderlichkeit', 'Unverzuegliche Loeschung', 9),
('CUSTOM_90D', '90 Tage', 'Benutzerdefinierte Aufbewahrungsfrist von 90 Tagen', NULL, 90, 'DAYS', 'Erstellung des Datensatzes', 'Automatische Loeschung nach 90 Tagen', 10),
('CUSTOM_14M', '14 Monate', 'Benutzerdefinierte Aufbewahrungsfrist von 14 Monaten (z.B. Analytics)', NULL, 14, 'MONTHS', 'Erstellung des Datensatzes', 'Automatische Loeschung nach 14 Monaten', 11),
('CUSTOM_30D', '30 Tage', 'Benutzerdefinierte Aufbewahrungsfrist von 30 Tagen', NULL, 30, 'DAYS', 'Erstellung des Datensatzes', 'Automatische Loeschung nach 30 Tagen', 12)
-- Idempotent: existing ids are skipped rather than updated.
ON CONFLICT (id) DO NOTHING;
-- =============================================================================
-- Recipients (15)
-- =============================================================================
-- Seeds 15 recipient categories with sort_order 1..15: six INTERNAL departments,
-- five PROCESSOR types and four AUTHORITY types. No CONTROLLER rows are seeded,
-- although the table's CHECK constraint allows that type.
-- Every row is seeded with is_third_country = FALSE. The four generic processor
-- rows (hosting, analytics, e-mail, helpdesk) leave country NULL; all other rows use 'DE'.
INSERT INTO vvt_lib_recipients (id, type, label_de, description_de, is_third_country, country, sort_order) VALUES
('INTERNAL_HR', 'INTERNAL', 'Personalabteilung', 'Interne HR-Abteilung', FALSE, 'DE', 1),
('INTERNAL_FINANCE', 'INTERNAL', 'Finanzabteilung', 'Interne Buchhaltung und Finanzen', FALSE, 'DE', 2),
('INTERNAL_IT', 'INTERNAL', 'IT-Abteilung', 'Interne IT-Administration', FALSE, 'DE', 3),
('INTERNAL_MANAGEMENT', 'INTERNAL', 'Geschaeftsfuehrung', 'Geschaeftsfuehrung und Vorstand', FALSE, 'DE', 4),
('INTERNAL_MARKETING', 'INTERNAL', 'Marketingabteilung', 'Internes Marketing-Team', FALSE, 'DE', 5),
('INTERNAL_SUPPORT', 'INTERNAL', 'Kundenservice', 'Interner Support und Service', FALSE, 'DE', 6),
('PROCESSOR_PAYROLL', 'PROCESSOR', 'Lohnabrechnungsdienstleister', 'Externer Gehaltsabrechnungs-Dienstleister', FALSE, 'DE', 7),
('PROCESSOR_HOSTING', 'PROCESSOR', 'Hosting-Provider', 'Cloud- oder Server-Hosting-Anbieter', FALSE, NULL, 8),
('PROCESSOR_ANALYTICS', 'PROCESSOR', 'Analytics-Anbieter', 'Web-Analytics und Tracking-Dienstleister', FALSE, NULL, 9),
('PROCESSOR_EMAIL', 'PROCESSOR', 'E-Mail-Dienstleister', 'Newsletter- und E-Mail-Versand-Anbieter', FALSE, NULL, 10),
('PROCESSOR_HELPDESK', 'PROCESSOR', 'Helpdesk-Anbieter', 'Ticketsystem- und Support-Plattform', FALSE, NULL, 11),
('AUTHORITY_FINANZAMT', 'AUTHORITY', 'Finanzamt', 'Zustaendiges Finanzamt', FALSE, 'DE', 12),
('AUTHORITY_SOZIALVERSICHERUNG', 'AUTHORITY', 'Sozialversicherungstraeger', 'Renten-, Kranken-, Arbeitslosen-, Pflegeversicherung', FALSE, 'DE', 13),
('AUTHORITY_KRANKENKASSE', 'AUTHORITY', 'Krankenkasse', 'Gesetzliche oder private Krankenkasse', FALSE, 'DE', 14),
('AUTHORITY_DATENSCHUTZ', 'AUTHORITY', 'Datenschutzbehoerde', 'Zustaendige Datenschutz-Aufsichtsbehoerde', FALSE, 'DE', 15)
-- Idempotent: existing ids are skipped rather than updated.
ON CONFLICT (id) DO NOTHING;
-- =============================================================================
-- Transfer Mechanisms (8)
-- =============================================================================
-- Seeds the transfer-mechanism library (sort_order 1-8). The article column holds
-- the GDPR article reference as display text.
-- requires_tia is TRUE for both SCC variants and for the TIA entry itself, and
-- FALSE for every other row.
-- NOTE(review): BCR is seeded with requires_tia = FALSE while both SCC variants
-- are TRUE — confirm this asymmetry is intended.
-- ON CONFLICT (id) DO NOTHING: a row whose id already exists is skipped, not updated.
INSERT INTO vvt_lib_transfer_mechanisms (id, label_de, description_de, article, requires_tia, sort_order) VALUES
('ADEQUACY_DECISION', 'Angemessenheitsbeschluss', 'EU-Angemessenheitsbeschluss gemaess Art. 45 DSGVO', 'Art. 45 DSGVO', FALSE, 1),
('SCC_CONTROLLER', 'Standardvertragsklauseln (C2C)', 'Standardvertragsklauseln Controller-zu-Controller', 'Art. 46 Abs. 2 lit. c DSGVO', TRUE, 2),
('SCC_PROCESSOR', 'Standardvertragsklauseln (C2P)', 'Standardvertragsklauseln Controller-zu-Processor', 'Art. 46 Abs. 2 lit. c DSGVO', TRUE, 3),
('BCR', 'Binding Corporate Rules', 'Verbindliche interne Datenschutzvorschriften', 'Art. 47 DSGVO', FALSE, 4),
('CONSENT_49A', 'Einwilligung (Art. 49)', 'Ausdrueckliche Einwilligung der betroffenen Person', 'Art. 49 Abs. 1 lit. a DSGVO', FALSE, 5),
('DEROGATION_49', 'Ausnahme (Art. 49)', 'Ausnahme fuer bestimmte Faelle gemaess Art. 49', 'Art. 49 DSGVO', FALSE, 6),
('DPF', 'EU-US Data Privacy Framework', 'Zertifizierung unter dem EU-US Data Privacy Framework', 'Art. 45 DSGVO (DPF)', FALSE, 7),
('TIA', 'Transfer Impact Assessment', 'Einzelfallbezogene Risikobewertung fuer Drittlandtransfers', 'Art. 46 DSGVO + Schrems II', TRUE, 8)
ON CONFLICT (id) DO NOTHING;
-- =============================================================================
-- Purposes (20)
-- =============================================================================
-- Seeds the processing-purpose library (sort_order 1-20).
-- typical_legal_basis holds a single id string such as 'BDSG_26' or 'ART6_1A'.
-- typical_for is written as a JSON array literal of business-function keys
-- ('hr', 'finance', 'sales_crm', 'marketing', 'support', 'it_operations',
-- 'management', 'legal', 'other').
-- NOTE(review): the table definition is not part of this section; the ids in
-- typical_legal_basis presumably point at the legal-basis library — confirm.
-- ON CONFLICT (id) DO NOTHING: a row whose id already exists is skipped, not updated.
INSERT INTO vvt_lib_purposes (id, label_de, description_de, typical_legal_basis, typical_for, sort_order) VALUES
('EMPLOYMENT_ADMIN', 'Personalverwaltung', 'Verwaltung des Beschaeftigungsverhaeltnisses', 'BDSG_26', '["hr"]', 1),
('PAYROLL', 'Gehaltsabrechnung', 'Durchfuehrung der Lohn- und Gehaltsabrechnung', 'BDSG_26', '["hr","finance"]', 2),
('RECRUITING', 'Bewerbermanagement', 'Durchfuehrung von Bewerbungsverfahren', 'BDSG_26', '["hr"]', 3),
('TIME_TRACKING', 'Zeiterfassung', 'Erfassung und Verwaltung von Arbeitszeiten', 'ART6_1C', '["hr"]', 4),
('ACCOUNTING', 'Buchhaltung', 'Fuehrung der Handelsbuecher und Finanzberichterstattung', 'ART6_1C', '["finance"]', 5),
('INVOICING', 'Rechnungsstellung', 'Erstellung und Verwaltung von Rechnungen', 'ART6_1B', '["finance"]', 6),
('CRM', 'Kundenbeziehungsmanagement', 'Verwaltung und Pflege von Kundenbeziehungen', 'ART6_1B', '["sales_crm"]', 7),
('DIRECT_MARKETING', 'Direktmarketing', 'Newsletter-Versand und Werbemassnahmen', 'ART6_1A', '["marketing"]', 8),
('WEBSITE_ANALYTICS', 'Web-Analyse', 'Analyse des Nutzerverhaltens auf der Website', 'ART6_1A', '["marketing","it_operations"]', 9),
('CUSTOMER_SUPPORT', 'Kundenbetreuung', 'Bearbeitung von Kundenanfragen und Support-Tickets', 'ART6_1B', '["support"]', 10),
('IT_ADMIN', 'IT-Administration', 'Verwaltung der IT-Infrastruktur und Benutzerkonten', 'ART6_1F', '["it_operations"]', 11),
('BACKUP_RECOVERY', 'Datensicherung', 'Backup-Erstellung und Wiederherstellung', 'ART6_1F', '["it_operations"]', 12),
('SECURITY_MONITORING', 'Sicherheitsueberwachung', 'Log-Analyse und Intrusion Detection', 'ART6_1F', '["it_operations"]', 13),
('IAM', 'Identitaets- und Zugriffsmanagement', 'Verwaltung von Benutzeridentitaeten und Berechtigungen', 'ART6_1F', '["it_operations"]', 14),
('VIDEO_CONFERENCING', 'Videokonferenz', 'Durchfuehrung von Online-Meetings und Videokonferenzen', 'ART6_1B', '["other"]', 15),
('VISITOR_MANAGEMENT', 'Besucherverwaltung', 'Erfassung und Verwaltung von Betriebsbesuchern', 'ART6_1F', '["management"]', 16),
('PAYMENT_PROCESSING', 'Zahlungsabwicklung', 'Verarbeitung und Abwicklung von Zahlungen', 'ART6_1B', '["finance"]', 17),
('SOCIAL_MEDIA', 'Social-Media-Marketing', 'Betrieb von Social-Media-Praesenzen', 'ART6_1A', '["marketing"]', 18),
('SALES_REPORTING', 'Vertriebssteuerung', 'Vertriebsanalysen und Berichterstattung', 'ART6_1F', '["sales_crm"]', 19),
('COMPLIANCE_DOCS', 'Compliance-Dokumentation', 'Erstellung und Pflege von Compliance-Dokumenten', 'ART6_1C', '["legal","management"]', 20)
ON CONFLICT (id) DO NOTHING;
-- =============================================================================
-- TOMs (20)
-- =============================================================================
-- Seeds the library of technical and organisational measures: four rows in each
-- of five categories (accessControl, confidentiality, integrity, availability,
-- separation), sort_order 1-20. The id prefix (AC_, CONF_, INT_, AVAIL_, SEP_)
-- mirrors the category.
-- art32_reference is display text: 'lit. a' for the encryption and
-- pseudonymisation rows, 'lit. c' for the availability rows, 'lit. b' otherwise.
-- ON CONFLICT (id) DO NOTHING: a row whose id already exists is skipped, not updated.
INSERT INTO vvt_lib_toms (id, category, label_de, description_de, art32_reference, sort_order) VALUES
('AC_RBAC', 'accessControl', 'Rollenbasierte Zugriffskontrolle (RBAC)', 'Zugriff nur nach Rolle und Berechtigung', 'Art. 32 Abs. 1 lit. b', 1),
('AC_MFA', 'accessControl', 'Multi-Faktor-Authentifizierung', 'Zwei- oder mehrstufige Anmeldung', 'Art. 32 Abs. 1 lit. b', 2),
('AC_NEED_TO_KNOW', 'accessControl', 'Need-to-Know-Prinzip', 'Zugriff nur auf fuer die Aufgabe erforderliche Daten', 'Art. 32 Abs. 1 lit. b', 3),
('AC_PAM', 'accessControl', 'Privileged Access Management', 'Verwaltung und Ueberwachung privilegierter Zugaenge', 'Art. 32 Abs. 1 lit. b', 4),
('CONF_ENCRYPTION_REST', 'confidentiality', 'Verschluesselung ruhender Daten', 'AES-256 Verschluesselung fuer gespeicherte Daten', 'Art. 32 Abs. 1 lit. a', 5),
('CONF_ENCRYPTION_TRANSIT', 'confidentiality', 'Transportverschluesselung', 'TLS 1.3 fuer alle Datenuebertragungen', 'Art. 32 Abs. 1 lit. a', 6),
('CONF_PSEUDONYMIZATION', 'confidentiality', 'Pseudonymisierung', 'Verarbeitung ohne direkten Personenbezug', 'Art. 32 Abs. 1 lit. a', 7),
('CONF_NDA', 'confidentiality', 'Vertraulichkeitsvereinbarungen', 'NDAs fuer Mitarbeiter und Auftragnehmer', 'Art. 32 Abs. 1 lit. b', 8),
('INT_AUDIT_LOG', 'integrity', 'Audit-Logging', 'Lueckenlose Protokollierung aller Datenzugriffe', 'Art. 32 Abs. 1 lit. b', 9),
('INT_FOUR_EYES', 'integrity', 'Vier-Augen-Prinzip', 'Kritische Aenderungen nur mit Freigabe durch zweite Person', 'Art. 32 Abs. 1 lit. b', 10),
('INT_CHECKSUMS', 'integrity', 'Pruefsummen und Hashing', 'Integritaetspruefung durch kryptographische Hashes', 'Art. 32 Abs. 1 lit. b', 11),
('INT_CHANGE_MGMT', 'integrity', 'Change Management', 'Dokumentierter Aenderungsprozess fuer IT-Systeme', 'Art. 32 Abs. 1 lit. b', 12),
('AVAIL_BACKUP', 'availability', 'Regelmaessige Backups', 'Taegliche und woechentliche Datensicherungen', 'Art. 32 Abs. 1 lit. c', 13),
('AVAIL_REDUNDANCY', 'availability', 'Redundante Systeme', 'Hochverfuegbarkeit durch Systemredundanz', 'Art. 32 Abs. 1 lit. c', 14),
('AVAIL_321_RULE', 'availability', '3-2-1 Backup-Regel', 'Drei Kopien, zwei Medien, ein externer Standort', 'Art. 32 Abs. 1 lit. c', 15),
('AVAIL_MONITORING', 'availability', 'System-Monitoring', 'Kontinuierliche Ueberwachung der Systemverfuegbarkeit', 'Art. 32 Abs. 1 lit. c', 16),
('SEP_TENANT_ISOLATION', 'separation', 'Mandantentrennung', 'Logische Trennung der Daten verschiedener Mandanten', 'Art. 32 Abs. 1 lit. b', 17),
('SEP_NETWORK_SEG', 'separation', 'Netzwerksegmentierung', 'Trennung von Netzwerkbereichen (VLANs, Firewalls)', 'Art. 32 Abs. 1 lit. b', 18),
('SEP_DATA_SEPARATION', 'separation', 'Datentrennung', 'Separate Datenbanken oder Schemas pro Zweck', 'Art. 32 Abs. 1 lit. b', 19),
('SEP_ENV_SEPARATION', 'separation', 'Umgebungstrennung', 'Getrennte Entwicklungs-, Test- und Produktionsumgebungen', 'Art. 32 Abs. 1 lit. b', 20)
ON CONFLICT (id) DO NOTHING;
COMMIT;
@@ -0,0 +1,54 @@
-- Migration 066: VVT Process Templates + Activity extensions
-- Template table + new ref columns on compliance_vvt_activities
BEGIN;
-- =============================================================================
-- Process Templates
-- =============================================================================
-- One row per reusable processing-activity template.
-- The *_refs columns are JSONB arrays that default to an empty array, and
-- retention_rule_ref is a plain VARCHAR. None of them carries a REFERENCES
-- clause, so the database does not validate the ids stored in them.
-- IF NOT EXISTS makes the statement a no-op when the table is already present;
-- an existing table is NOT altered to match this definition.
CREATE TABLE IF NOT EXISTS vvt_process_templates (
id VARCHAR(80) PRIMARY KEY,
name VARCHAR(300) NOT NULL,
description TEXT,
business_function VARCHAR(50),
purpose_refs JSONB DEFAULT '[]'::jsonb,
legal_basis_refs JSONB DEFAULT '[]'::jsonb,
data_subject_refs JSONB DEFAULT '[]'::jsonb,
data_category_refs JSONB DEFAULT '[]'::jsonb,
recipient_refs JSONB DEFAULT '[]'::jsonb,
tom_refs JSONB DEFAULT '[]'::jsonb,
transfer_mechanism_refs JSONB DEFAULT '[]'::jsonb,
retention_rule_ref VARCHAR(50),
typical_systems JSONB DEFAULT '[]'::jsonb,
-- No CHECK constraint: any string of up to 10 characters is accepted.
protection_level VARCHAR(10) DEFAULT 'MEDIUM',
dpia_required BOOLEAN DEFAULT FALSE,
risk_score INTEGER,
tags JSONB DEFAULT '[]'::jsonb,
-- NOTE(review): presumably FALSE marks tenant-created templates — confirm.
is_system BOOLEAN DEFAULT TRUE,
-- Nullable; presumably NULL for system-wide templates — TODO confirm.
tenant_id UUID,
sort_order INTEGER DEFAULT 0,
created_at TIMESTAMPTZ DEFAULT NOW(),
-- Set on insert only; nothing in this migration refreshes it on UPDATE.
updated_at TIMESTAMPTZ DEFAULT NOW()
);
-- Lookup indexes for filtering templates by business function and by the system flag.
CREATE INDEX IF NOT EXISTS idx_vvt_process_templates_bf ON vvt_process_templates(business_function);
CREATE INDEX IF NOT EXISTS idx_vvt_process_templates_system ON vvt_process_templates(is_system);
-- =============================================================================
-- Extend compliance_vvt_activities with library references, links and scoring.
-- All thirteen columns are nullable with DEFAULT NULL, so existing rows stay
-- valid. IF NOT EXISTS on each action skips columns that are already present.
-- =============================================================================
ALTER TABLE compliance_vvt_activities
    ADD COLUMN IF NOT EXISTS purpose_refs JSONB DEFAULT NULL,
    ADD COLUMN IF NOT EXISTS legal_basis_refs JSONB DEFAULT NULL,
    ADD COLUMN IF NOT EXISTS data_subject_refs JSONB DEFAULT NULL,
    ADD COLUMN IF NOT EXISTS data_category_refs JSONB DEFAULT NULL,
    ADD COLUMN IF NOT EXISTS recipient_refs JSONB DEFAULT NULL,
    ADD COLUMN IF NOT EXISTS retention_rule_ref VARCHAR(50) DEFAULT NULL,
    ADD COLUMN IF NOT EXISTS transfer_mechanism_refs JSONB DEFAULT NULL,
    ADD COLUMN IF NOT EXISTS tom_refs JSONB DEFAULT NULL,
    ADD COLUMN IF NOT EXISTS linked_loeschfristen_ids JSONB DEFAULT NULL,
    ADD COLUMN IF NOT EXISTS linked_tom_measure_ids JSONB DEFAULT NULL,
    ADD COLUMN IF NOT EXISTS source_template_id VARCHAR(80) DEFAULT NULL,
    ADD COLUMN IF NOT EXISTS risk_score INTEGER DEFAULT NULL,
    ADD COLUMN IF NOT EXISTS art30_completeness JSONB DEFAULT NULL;
COMMIT;
@@ -0,0 +1,305 @@
-- Migration 067: VVT Process Templates Seed — 18 templates from vvt-baseline-catalog
-- All content self-authored, MIT-compatible.
--
-- Inserts 18 rows into vvt_process_templates: 4 HR, 2 finance, 2 sales/CRM,
-- 3 marketing, 1 support, 4 IT and 2 other, numbered 1-18 via sort_order.
-- Columns missing from the INSERT column list (transfer_mechanism_refs,
-- is_system, tenant_id, created_at, updated_at) take their column default,
-- or NULL where the column has none.
-- The *_refs values are JSON array literals holding library ids; nothing in
-- this statement checks that those ids exist in the library tables.
-- ON CONFLICT (id) DO NOTHING: a row whose id already exists is skipped, not
-- updated, so edits to a template below do not reach databases already seeded.
BEGIN;
INSERT INTO vvt_process_templates (id, name, description, business_function, purpose_refs, legal_basis_refs, data_subject_refs, data_category_refs, recipient_refs, tom_refs, retention_rule_ref, typical_systems, protection_level, dpia_required, risk_score, tags, sort_order) VALUES
-- HR Templates
-- hr-mitarbeiterverwaltung is the only template seeded with dpia_required = TRUE.
('hr-mitarbeiterverwaltung',
'Mitarbeiterverwaltung',
'Verwaltung des Beschaeftigungsverhaeltnisses inkl. Personalakte, Urlaub, Krankmeldungen',
'hr',
'["EMPLOYMENT_ADMIN", "PAYROLL"]',
'["BDSG_26", "ART6_1B"]',
'["EMPLOYEES"]',
'["NAME", "DOB", "ADDRESS", "CONTACT", "SOCIAL_SECURITY", "BANK_ACCOUNT", "EMPLOYMENT_DATA", "HEALTH_DATA"]',
'["INTERNAL_HR", "INTERNAL_FINANCE", "PROCESSOR_PAYROLL", "AUTHORITY_SOZIALVERSICHERUNG", "AUTHORITY_KRANKENKASSE"]',
'["AC_RBAC", "AC_NEED_TO_KNOW", "CONF_ENCRYPTION_REST", "CONF_ENCRYPTION_TRANSIT", "INT_AUDIT_LOG", "SEP_TENANT_ISOLATION"]',
'HGB_257_10Y',
'["HR-Software", "Personalakte (digital)"]',
'HIGH', TRUE, 3,
'["personal", "pflicht"]',
1),
('hr-gehaltsabrechnung',
'Gehaltsabrechnung',
'Monatliche Lohn- und Gehaltsabrechnung inkl. Steuer- und Sozialversicherungsmeldungen',
'hr',
'["PAYROLL"]',
'["BDSG_26", "ART6_1C"]',
'["EMPLOYEES"]',
'["NAME", "ADDRESS", "SOCIAL_SECURITY", "TAX_ID", "BANK_ACCOUNT", "SALARY_DATA"]',
'["INTERNAL_HR", "INTERNAL_FINANCE", "PROCESSOR_PAYROLL", "AUTHORITY_FINANZAMT", "AUTHORITY_SOZIALVERSICHERUNG"]',
'["AC_RBAC", "AC_NEED_TO_KNOW", "CONF_ENCRYPTION_REST", "CONF_ENCRYPTION_TRANSIT", "INT_AUDIT_LOG", "INT_FOUR_EYES"]',
'AO_147_10Y',
'["Lohnabrechnungssoftware", "DATEV"]',
'HIGH', FALSE, 3,
'["personal", "finanzen", "pflicht"]',
2),
('hr-bewerbermanagement',
'Bewerbermanagement',
'Durchfuehrung von Bewerbungsverfahren vom Eingang bis zur Zu-/Absage',
'hr',
'["RECRUITING"]',
'["BDSG_26", "ART6_1B"]',
'["APPLICANTS"]',
'["NAME", "DOB", "ADDRESS", "CONTACT", "EDUCATION_DATA", "PHOTO_VIDEO"]',
'["INTERNAL_HR", "INTERNAL_MANAGEMENT"]',
'["AC_RBAC", "AC_NEED_TO_KNOW", "CONF_ENCRYPTION_REST", "CONF_NDA"]',
'AGG_15_6M',
'["Bewerbermanagement-Software", "E-Mail"]',
'MEDIUM', FALSE, 2,
'["personal", "recruiting"]',
3),
('hr-zeiterfassung',
'Zeiterfassung',
'Erfassung und Verwaltung von Arbeitszeiten gemaess ArbZG',
'hr',
'["TIME_TRACKING"]',
'["ART6_1C", "BDSG_26"]',
'["EMPLOYEES"]',
'["NAME", "EMPLOYMENT_DATA"]',
'["INTERNAL_HR", "INTERNAL_MANAGEMENT"]',
'["AC_RBAC", "INT_AUDIT_LOG", "CONF_ENCRYPTION_TRANSIT"]',
'ARBZG_16_2Y',
'["Zeiterfassungssystem", "Stempeluhr"]',
'LOW', FALSE, 1,
'["personal", "pflicht"]',
4),
-- Finance Templates
('finance-buchhaltung',
'Buchhaltung',
'Fuehrung der Handelsbuecher und steuerrechtliche Dokumentation',
'finance',
'["ACCOUNTING", "INVOICING"]',
'["ART6_1C", "ART6_1B"]',
'["CUSTOMERS", "SUPPLIERS", "EMPLOYEES"]',
'["NAME", "ADDRESS", "CONTACT", "BANK_ACCOUNT", "PAYMENT_DATA", "CONTRACT_DATA", "TAX_ID"]',
'["INTERNAL_FINANCE", "AUTHORITY_FINANZAMT", "PROCESSOR_HOSTING"]',
'["AC_RBAC", "INT_AUDIT_LOG", "INT_FOUR_EYES", "CONF_ENCRYPTION_REST", "AVAIL_BACKUP"]',
'HGB_257_10Y',
'["Buchhaltungssoftware", "DATEV", "ERP-System"]',
'HIGH', FALSE, 2,
'["finanzen", "pflicht"]',
5),
('finance-zahlungsverkehr',
'Zahlungsverkehr',
'Verarbeitung und Abwicklung von ein- und ausgehenden Zahlungen',
'finance',
'["PAYMENT_PROCESSING"]',
'["ART6_1B", "ART6_1C"]',
'["CUSTOMERS", "SUPPLIERS"]',
'["NAME", "BANK_ACCOUNT", "PAYMENT_DATA", "CONTRACT_DATA"]',
'["INTERNAL_FINANCE", "PROCESSOR_HOSTING"]',
'["AC_RBAC", "AC_MFA", "CONF_ENCRYPTION_REST", "CONF_ENCRYPTION_TRANSIT", "INT_AUDIT_LOG"]',
'HGB_257_10Y',
'["Online-Banking", "Payment-Gateway"]',
'HIGH', FALSE, 3,
'["finanzen"]',
6),
-- Sales/CRM Templates
('sales-kundenverwaltung',
'Kundenverwaltung',
'Verwaltung und Pflege der Kundenbeziehungen im CRM-System',
'sales_crm',
'["CRM"]',
'["ART6_1B", "ART6_1F"]',
'["CUSTOMERS", "PROSPECTIVE_CUSTOMERS"]',
'["NAME", "ADDRESS", "CONTACT", "CONTRACT_DATA", "COMMUNICATION_DATA"]',
'["INTERNAL_MARKETING", "INTERNAL_SUPPORT", "PROCESSOR_HOSTING"]',
'["AC_RBAC", "CONF_ENCRYPTION_REST", "CONF_ENCRYPTION_TRANSIT", "INT_AUDIT_LOG", "SEP_TENANT_ISOLATION"]',
'BGB_195_3Y',
'["CRM-System", "E-Mail-Client"]',
'MEDIUM', FALSE, 2,
'["vertrieb", "kunden"]',
7),
('sales-vertriebssteuerung',
'Vertriebssteuerung',
'Vertriebsanalysen, Forecasting und Berichterstattung',
'sales_crm',
'["SALES_REPORTING"]',
'["ART6_1F"]',
'["CUSTOMERS", "PROSPECTIVE_CUSTOMERS"]',
'["NAME", "CONTACT", "CONTRACT_DATA"]',
'["INTERNAL_MANAGEMENT", "INTERNAL_MARKETING"]',
'["AC_RBAC", "AC_NEED_TO_KNOW", "CONF_PSEUDONYMIZATION"]',
'BGB_195_3Y',
'["CRM-System", "BI-Tool"]',
'LOW', FALSE, 1,
'["vertrieb", "reporting"]',
8),
-- Marketing Templates
('marketing-newsletter',
'Newsletter-Versand',
'Versand von Newslettern und Werbemails an Abonnenten',
'marketing',
'["DIRECT_MARKETING"]',
'["ART6_1A", "UWG_7"]',
'["NEWSLETTER_SUBSCRIBERS", "CUSTOMERS"]',
'["NAME", "CONTACT", "USAGE_DATA"]',
'["INTERNAL_MARKETING", "PROCESSOR_EMAIL"]',
'["AC_RBAC", "CONF_ENCRYPTION_TRANSIT", "SEP_DATA_SEPARATION"]',
'CONSENT_REVOKE',
'["Newsletter-Tool", "E-Mail-Marketing-Plattform"]',
'LOW', FALSE, 1,
'["marketing", "einwilligung"]',
9),
('marketing-website-analytics',
'Website-Analyse',
'Analyse des Nutzerverhaltens auf der Unternehmenswebsite',
'marketing',
'["WEBSITE_ANALYTICS"]',
'["ART6_1A"]',
'["WEBSITE_USERS"]',
'["IP_ADDRESS", "DEVICE_ID", "USAGE_DATA"]',
'["INTERNAL_MARKETING", "PROCESSOR_ANALYTICS"]',
'["CONF_PSEUDONYMIZATION", "CONF_ENCRYPTION_TRANSIT", "SEP_DATA_SEPARATION"]',
'CUSTOM_14M',
'["Web-Analytics-Tool", "Tag-Manager"]',
'LOW', FALSE, 1,
'["marketing", "einwilligung", "tracking"]',
10),
('marketing-social-media',
'Social-Media-Marketing',
'Betrieb und Verwaltung von Social-Media-Praesenzen',
'marketing',
'["SOCIAL_MEDIA"]',
'["ART6_1A", "ART6_1F"]',
'["WEBSITE_USERS", "CUSTOMERS"]',
'["NAME", "CONTACT", "USAGE_DATA", "PHOTO_VIDEO"]',
'["INTERNAL_MARKETING", "PROCESSOR_ANALYTICS"]',
'["AC_RBAC", "CONF_ENCRYPTION_TRANSIT"]',
'PURPOSE_END',
'["Social-Media-Plattformen", "Social-Media-Management-Tool"]',
'LOW', FALSE, 1,
'["marketing", "social-media"]',
11),
-- Support Templates
('support-ticketsystem',
'Ticketsystem / Kundenservice',
'Bearbeitung von Kundenanfragen ueber das Ticketsystem',
'support',
'["CUSTOMER_SUPPORT"]',
'["ART6_1B"]',
'["CUSTOMERS"]',
'["NAME", "CONTACT", "COMMUNICATION_DATA", "CONTRACT_DATA"]',
'["INTERNAL_SUPPORT", "PROCESSOR_HELPDESK"]',
'["AC_RBAC", "CONF_ENCRYPTION_TRANSIT", "INT_AUDIT_LOG"]',
'BGB_195_3Y',
'["Ticketsystem", "Help-Desk-Software"]',
'MEDIUM', FALSE, 1,
'["support", "kunden"]',
12),
-- IT Templates
('it-systemadministration',
'IT-Systemadministration',
'Verwaltung der IT-Infrastruktur, Benutzerkonten und Berechtigungen',
'it_operations',
'["IT_ADMIN"]',
'["ART6_1F", "ART6_1B"]',
'["EMPLOYEES"]',
'["NAME", "LOGIN_DATA", "IP_ADDRESS", "DEVICE_ID"]',
'["INTERNAL_IT", "PROCESSOR_HOSTING"]',
'["AC_RBAC", "AC_MFA", "AC_PAM", "CONF_ENCRYPTION_REST", "CONF_ENCRYPTION_TRANSIT", "INT_AUDIT_LOG", "SEP_NETWORK_SEG", "SEP_ENV_SEPARATION"]',
'CUSTOM_90D',
'["Active Directory", "LDAP", "IT-Management-Tool"]',
'HIGH', FALSE, 2,
'["it", "infrastruktur"]',
13),
('it-backup',
'Datensicherung und Recovery',
'Regelmaessige Backups und Wiederherstellungsverfahren',
'it_operations',
'["BACKUP_RECOVERY"]',
'["ART6_1F"]',
'["EMPLOYEES", "CUSTOMERS"]',
'["NAME", "ADDRESS", "CONTACT", "CONTRACT_DATA", "LOGIN_DATA"]',
'["INTERNAL_IT", "PROCESSOR_HOSTING"]',
'["AVAIL_BACKUP", "AVAIL_321_RULE", "AVAIL_REDUNDANCY", "CONF_ENCRYPTION_REST", "INT_CHECKSUMS"]',
'CUSTOM_90D',
'["Backup-Software", "Cloud-Backup", "NAS"]',
'HIGH', FALSE, 2,
'["it", "verfuegbarkeit"]',
14),
('it-logging',
'Logging und Sicherheitsueberwachung',
'Protokollierung von System- und Sicherheitsereignissen',
'it_operations',
'["SECURITY_MONITORING"]',
'["ART6_1F"]',
'["EMPLOYEES", "CUSTOMERS", "WEBSITE_USERS"]',
'["IP_ADDRESS", "LOGIN_DATA", "USAGE_DATA", "DEVICE_ID"]',
'["INTERNAL_IT"]',
'["CONF_ENCRYPTION_REST", "INT_AUDIT_LOG", "INT_CHECKSUMS", "AVAIL_MONITORING", "SEP_DATA_SEPARATION"]',
'CUSTOM_90D',
'["SIEM-System", "Log-Management", "Monitoring-Tool"]',
'MEDIUM', FALSE, 2,
'["it", "sicherheit"]',
15),
-- NOTE(review): it-iam uses retention rule 'AGG_15_6M', which the retention
-- library describes as the deletion rule for applicant data — confirm this is
-- the intended rule for employee identity/access data.
('it-iam',
'Identitaets- und Zugriffsmanagement',
'Verwaltung von Benutzeridentitaeten, Rollen und Berechtigungen',
'it_operations',
'["IAM"]',
'["ART6_1F", "BDSG_26"]',
'["EMPLOYEES"]',
'["NAME", "LOGIN_DATA", "EMPLOYMENT_DATA"]',
'["INTERNAL_IT", "INTERNAL_HR"]',
'["AC_RBAC", "AC_MFA", "AC_PAM", "AC_NEED_TO_KNOW", "INT_AUDIT_LOG", "CONF_ENCRYPTION_REST"]',
'AGG_15_6M',
'["IAM-System", "SSO-Provider", "Active Directory"]',
'HIGH', FALSE, 2,
'["it", "sicherheit", "zugriffskontrolle"]',
16),
-- Other Templates
('other-videokonferenz',
'Videokonferenz',
'Durchfuehrung von Online-Meetings und Videokonferenzen',
'other',
'["VIDEO_CONFERENCING"]',
'["ART6_1B", "ART6_1F"]',
'["EMPLOYEES", "CUSTOMERS", "BUSINESS_PARTNERS"]',
'["NAME", "CONTACT", "PHOTO_VIDEO", "IP_ADDRESS"]',
'["INTERNAL_IT", "PROCESSOR_HOSTING"]',
'["CONF_ENCRYPTION_TRANSIT", "AC_RBAC"]',
'PURPOSE_END',
'["Videokonferenz-Tool", "Webinar-Plattform"]',
'LOW', FALSE, 1,
'["kommunikation"]',
17),
('other-besuchermanagement',
'Besuchermanagement',
'Erfassung und Verwaltung von Betriebsbesuchern',
'other',
'["VISITOR_MANAGEMENT"]',
'["ART6_1F"]',
'["VISITORS"]',
'["NAME", "CONTACT", "PHOTO_VIDEO"]',
'["INTERNAL_MANAGEMENT"]',
'["AC_RBAC", "CONF_ENCRYPTION_REST"]',
'CUSTOM_30D',
'["Besuchermanagement-System", "Empfangsterminal"]',
'LOW', FALSE, 1,
'["sonstiges", "besucher"]',
18)
ON CONFLICT (id) DO NOTHING;
@@ -0,0 +1,65 @@
-- Migration 068: TOM ↔ Canonical Control Mappings
-- Bridge table connecting TOM measures (88) to Canonical Controls (10,000+)
-- Enables three-layer architecture: TOM → Mapping → Canonical Controls
-- ============================================================================
-- 1. Mapping table (TOM control code → Canonical control)
-- ============================================================================
-- One row links a TOM control code to one canonical control for a tenant and,
-- optionally, a project (project_id is nullable).
-- CREATE TABLE IF NOT EXISTS is a no-op when the table already exists, so a
-- database created from an earlier version of this file keeps its old
-- constraint and needs a follow-up migration to pick up the change below.
CREATE TABLE IF NOT EXISTS tom_control_mappings (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    tenant_id UUID NOT NULL,
    project_id UUID,
    -- TOM side (references the embedded TOM control code, e.g. 'TOM-AC-01')
    tom_control_code VARCHAR(20) NOT NULL,
    tom_category VARCHAR(50) NOT NULL,
    -- Canonical control side
    canonical_control_id UUID NOT NULL,
    canonical_control_code VARCHAR(20) NOT NULL,
    canonical_category VARCHAR(50),
    -- Mapping metadata
    mapping_type VARCHAR(20) NOT NULL DEFAULT 'auto'
        CHECK (mapping_type IN ('auto', 'manual')),
    relevance_score NUMERIC(3,2) DEFAULT 1.00
        CHECK (relevance_score >= 0 AND relevance_score <= 1),
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    -- No duplicate mappings per tenant+project+TOM+canonical.
    -- project_id is nullable, and a plain UNIQUE constraint treats NULLs as
    -- distinct: identical mappings with project_id NULL would all be accepted.
    -- NULLS NOT DISTINCT (requires PostgreSQL 15+) makes NULL project_ids compare
    -- equal, while the column list stays usable as an ON CONFLICT target.
    UNIQUE NULLS NOT DISTINCT (tenant_id, project_id, tom_control_code, canonical_control_id)
);
-- Secondary lookup indexes: by tenant/project scope, by TOM code, by canonical
-- control id and by TOM category.
CREATE INDEX IF NOT EXISTS idx_tcm_tenant_project
    ON tom_control_mappings (tenant_id, project_id);
CREATE INDEX IF NOT EXISTS idx_tcm_tom_code
    ON tom_control_mappings (tom_control_code);
CREATE INDEX IF NOT EXISTS idx_tcm_canonical_id
    ON tom_control_mappings (canonical_control_id);
CREATE INDEX IF NOT EXISTS idx_tcm_tom_category
    ON tom_control_mappings (tom_category);
-- ============================================================================
-- 2. Sync state (tracks when the last sync ran + profile hash)
-- ============================================================================
-- Holds the outcome of the most recent sync for a tenant and, optionally, a
-- project (project_id is nullable).
-- CREATE TABLE IF NOT EXISTS is a no-op when the table already exists, so a
-- database created from an earlier version of this file keeps its old
-- constraint and needs a follow-up migration to pick up the change below.
CREATE TABLE IF NOT EXISTS tom_control_sync_state (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    tenant_id UUID NOT NULL,
    project_id UUID,
    -- Profile hash to detect changes (SHA-256 of serialized company profile)
    profile_hash VARCHAR(64),
    -- Stats from last sync
    total_mappings INTEGER DEFAULT 0,
    canonical_controls_matched INTEGER DEFAULT 0,
    tom_controls_covered INTEGER DEFAULT 0,
    last_synced_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    -- One sync state per tenant+project.
    -- project_id is nullable, and a plain UNIQUE constraint treats NULLs as
    -- distinct: a tenant without a project could collect any number of rows.
    -- NULLS NOT DISTINCT (requires PostgreSQL 15+) makes NULL project_ids compare
    -- equal, so ON CONFLICT (tenant_id, project_id) also fires for such rows.
    UNIQUE NULLS NOT DISTINCT (tenant_id, project_id)
);
@@ -0,0 +1,3 @@
-- Obligations: vendor linkage for Art. 28 GDPR.
-- Adds a JSONB column that defaults to an empty array; the statement is
-- skipped when the column already exists.
ALTER TABLE compliance_obligations ADD COLUMN IF NOT EXISTS linked_vendor_ids JSONB DEFAULT '[]'::jsonb;
@@ -0,0 +1,3 @@
-- Loeschfristen (retention periods): vendor linkage.
-- Adds a JSONB column that defaults to an empty array; the statement is
-- skipped when the column already exists.
ALTER TABLE compliance_loeschfristen ADD COLUMN IF NOT EXISTS linked_vendor_ids JSONB DEFAULT '[]'::jsonb;
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,873 @@
-- Migration 073: Module Document Templates
-- Reference templates for VVT, TOM, Loeschfristen and Pflichten modules
-- These match the structure of the module-specific document generators
-- and enable versioning in the document-generator
-- ===========================================================================
-- Template 1: VVT — Verarbeitungsverzeichnis (Art. 30 DSGVO)
-- ===========================================================================
INSERT INTO compliance_legal_templates (
id, tenant_id, document_type, title, description, content,
placeholders, language, jurisdiction,
license_id, license_name, source_name,
attribution_required, is_complete_document, version, status,
created_at, updated_at
) SELECT
gen_random_uuid(),
'9282a473-5c95-4b3a-bf78-0ecc0ec71d3e',
'vvt_register',
'Verarbeitungsverzeichnis (Art. 30 DSGVO)',
'Vollstaendiges Verzeichnis von Verarbeitungstaetigkeiten gemaess Art. 30 Abs. 1 DSGVO. Dokumentiert alle Verarbeitungen mit Rechtsgrundlagen, Datenkategorien, Empfaengern, Drittlandtransfers und Loeschfristen.',
$template$# Verarbeitungsverzeichnis (Art. 30 DSGVO)
## Dokumentenkontrolle
| Feld | Wert |
|------|------|
| Unternehmen | {{COMPANY_NAME}} |
| Dokumenttyp | Verzeichnis von Verarbeitungstaetigkeiten |
| Version | {{DOCUMENT_VERSION}} |
| Datum | {{VERSION_DATE}} |
| Klassifizierung | Vertraulich |
| Datenschutzbeauftragter | {{DPO_NAME}} |
| Kontakt DSB | {{DPO_CONTACT}} |
| Verantwortlicher | {{RESPONSIBLE_PERSON}} |
| Naechste Pruefung | {{NEXT_REVIEW_DATE}} |
### Aenderungshistorie
| Version | Datum | Autor | Aenderung |
|---------|-------|-------|-----------|
| {{DOCUMENT_VERSION}} | {{VERSION_DATE}} | {{DPO_NAME}} | Erstfassung |
---
## 1. Ziel und Zweck
Dieses Verarbeitungsverzeichnis dient der Dokumentation aller Verarbeitungstaetigkeiten von **{{COMPANY_NAME}}** gemaess Art. 30 Abs. 1 DSGVO. Es enthaelt saemtliche Pflichtangaben und wird regelmaessig auf Vollstaendigkeit und Aktualitaet geprueft.
### Gesetzliche Grundlage
| Rechtsgrundlage | Inhalt |
|-----------------|--------|
| **Art. 30 Abs. 1 DSGVO** | Pflicht des Verantwortlichen, ein Verzeichnis aller Verarbeitungstaetigkeiten zu fuehren |
| **Art. 30 Abs. 2 DSGVO** | Pflicht des Auftragsverarbeiters, ein Verzeichnis aller Kategorien von Verarbeitungstaetigkeiten zu fuehren |
| **Art. 30 Abs. 4 DSGVO** | Bereitstellungspflicht gegenueber der Aufsichtsbehoerde |
| **Art. 5 Abs. 2 DSGVO** | Rechenschaftspflicht — Nachweis der Einhaltung der DSGVO-Grundsaetze |
---
## 2. Organisation und Verantwortlichkeiten
| Rolle | Person / Abteilung |
|-------|--------------------|
| Verantwortlicher (Art. 4 Nr. 7) | {{RESPONSIBLE_PERSON}} |
| Datenschutzbeauftragter (Art. 37-39) | {{DPO_NAME}} ({{DPO_CONTACT}}) |
| VVT-Pflege | Fachabteilungen in Abstimmung mit DSB |
**Hinweis:** Jede Fachabteilung ist verpflichtet, neue Verarbeitungstaetigkeiten vor deren Beginn beim DSB zu melden. Aenderungen an bestehenden Verarbeitungen sind unverzueglich zu kommunizieren.
---
## 3. Verarbeitungstaetigkeiten (Art. 30 Abs. 1)
### Pflichtangaben je Verarbeitungstaetigkeit
Fuer jede Verarbeitungstaetigkeit werden folgende Pflichtfelder nach Art. 30 DSGVO dokumentiert:
| Pflichtfeld (Art. 30) | Beschreibung |
|------------------------|-------------|
| **VVT-Nr.** | Eindeutige Kennung der Verarbeitungstaetigkeit |
| **Bezeichnung** | Bezeichnung der Verarbeitungstaetigkeit |
| **Verantwortlicher** | Name und Kontaktdaten des Verantwortlichen |
| **Geschaeftsbereich** | Zustaendige Organisationseinheit |
| **Zwecke der Verarbeitung** | Beschreibung aller Verarbeitungszwecke |
| **Rechtsgrundlage(n)** | Art. 6 Abs. 1 lit. a-f DSGVO; ggf. Art. 9 Abs. 2 DSGVO |
| **Kategorien betroffener Personen** | z.B. Mitarbeiter, Kunden, Lieferanten, Schueler |
| **Kategorien personenbezogener Daten** | z.B. Stammdaten, Kontaktdaten, Vertragsdaten; Art. 9-Kategorien gesondert kennzeichnen |
| **Empfaengerkategorien** | Intern, extern, Auftragsverarbeiter, Behoerden |
| **Uebermittlung an Drittlaender** | Zielland, Empfaenger, Transfermechanismus (Art. 44-49) |
| **Loeschfristen** | Vorgesehene Fristen fuer die Loeschung, Rechtsgrundlage, Verfahren |
| **TOM (Art. 32)** | Beschreibung der technischen und organisatorischen Massnahmen |
### Verarbeitungsuebersicht
*Die konkreten Verarbeitungstaetigkeiten werden vom VVT-Modul automatisch in das Dokument eingefuegt. Jede Verarbeitungstaetigkeit wird als separate Detailkarte mit allen Pflichtfeldern dargestellt.*
| VVT-Nr. | Bezeichnung | Geschaeftsbereich | Rechtsgrundlage | Status |
|----------|-------------|-------------------|-----------------|--------|
| *Wird automatisch befuellt* | | | | |
### Detailkarten
Fuer jede Verarbeitungstaetigkeit wird eine Detailkarte erstellt mit:
- Alle Pflichtangaben nach Art. 30 in tabellarischer Form
- Kennzeichnung besonderer Kategorien (Art. 9 DSGVO)
- Kennzeichnung DSFA-Pflicht (Art. 35 DSGVO)
- Kennzeichnung Drittlanduebermittlung (Art. 44-49 DSGVO)
- Strukturierte TOMs nach Kategorie (Zugriffskontrolle, Vertraulichkeit, Integritaet, Verfuegbarkeit, Trennbarkeit)
- Schutzniveau und Deployment-Modell
---
## 4. Auftragsverarbeiter (Art. 30 Abs. 2)
Sofern **{{COMPANY_NAME}}** als Auftragsverarbeiter taetig ist, wird ein separates Verzeichnis nach Art. 30 Abs. 2 DSGVO gefuehrt. Dieses enthaelt:
| Pflichtfeld (Art. 30 Abs. 2) | Beschreibung |
|-------------------------------|-------------|
| Name und Kontaktdaten des Auftragsverarbeiters | {{COMPANY_NAME}} |
| Kategorien von Verarbeitungen | Art der im Auftrag durchgefuehrten Verarbeitungen |
| Name und Kontaktdaten des Verantwortlichen | Auftraggeber |
| Uebermittlungen in Drittlaender | Zielland, Empfaenger, Garantien |
| Technische und organisatorische Massnahmen | Art. 32 DSGVO |
---
## 5. TOM-Beschreibung (Art. 32 DSGVO)
Fuer jede Verarbeitungstaetigkeit werden die technischen und organisatorischen Massnahmen dokumentiert:
| Kategorie | Beschreibung |
|-----------|-------------|
| **Zugriffskontrolle** | Massnahmen zur Steuerung des Zugriffs auf personenbezogene Daten |
| **Vertraulichkeit** | Verschluesselung, Pseudonymisierung, Zutrittskontrolle |
| **Integritaet** | Eingabekontrolle, Weitergabekontrolle, Protokollierung |
| **Verfuegbarkeit** | Backup, Redundanz, Disaster Recovery |
| **Trennbarkeit** | Mandantentrennung, Zweckbindung |
**Verweis:** Die vollstaendige TOM-Dokumentation wird im separaten TOM-Modul gefuehrt und hier je Verarbeitungstaetigkeit referenziert.
---
## 6. Pruefverfahren und Revision
| Eigenschaft | Wert |
|-------------|------|
| Pruefintervall | Jaehrlich |
| Letzte Pruefung | {{VERSION_DATE}} |
| Naechste Pruefung | {{NEXT_REVIEW_DATE}} |
| Aktuelle Version | {{DOCUMENT_VERSION}} |
### Pruefpunkte
Bei jeder Pruefung wird das VVT auf folgende Punkte ueberprueft:
- Vollstaendigkeit: Sind alle Verarbeitungstaetigkeiten erfasst?
- Aktualitaet: Stimmen die Angaben noch mit der Praxis ueberein?
- Art. 30-Konformitaet: Enthalten alle Eintraege die Pflichtangaben?
- Art. 9-Kennzeichnung: Sind besondere Kategorien korrekt markiert?
- Drittlandtransfers: Sind Transfermechanismen dokumentiert?
- Loeschfristen: Sind Aufbewahrungsfristen definiert und aktuell?
- TOM-Verweise: Sind Massnahmen je Verarbeitung beschrieben?
---
*Dieses Dokument wird automatisch vom VVT-Modul generiert und enthaelt alle erfassten Verarbeitungstaetigkeiten mit vollstaendigen Pflichtangaben nach Art. 30 DSGVO.*
*Erstellt mit BreakPilot Compliance — {{COMPANY_NAME}} | Stand: {{VERSION_DATE}} | Version {{DOCUMENT_VERSION}}*
$template$,
'["COMPANY_NAME","DPO_NAME","DPO_CONTACT","RESPONSIBLE_PERSON","DOCUMENT_VERSION","VERSION_DATE","NEXT_REVIEW_DATE"]'::jsonb,
'de', 'DE',
'mit', 'MIT License', 'BreakPilot Compliance',
false, true, '1.0.0', 'published',
NOW(), NOW()
WHERE NOT EXISTS (
SELECT 1 FROM compliance_legal_templates
WHERE document_type = 'vvt_register'
AND tenant_id = '9282a473-5c95-4b3a-bf78-0ecc0ec71d3e'
);
-- ===========================================================================
-- Template 2: TOM — TOM-Dokumentation (Art. 32 DSGVO)
-- ===========================================================================
INSERT INTO compliance_legal_templates (
id, tenant_id, document_type, title, description, content,
placeholders, language, jurisdiction,
license_id, license_name, source_name,
attribution_required, is_complete_document, version, status,
created_at, updated_at
) SELECT
gen_random_uuid(),
'9282a473-5c95-4b3a-bf78-0ecc0ec71d3e',
'tom_documentation',
'TOM-Dokumentation (Art. 32 DSGVO)',
'Dokumentation aller technischen und organisatorischen Massnahmen gemaess Art. 32 DSGVO. Umfasst Schutzbedarf, Risikoprofil, Massnahmenkatalog nach Kategorie, SDM-Gewaehrleistungsziele und Compliance-Status.',
$template$# TOM-Dokumentation (Art. 32 DSGVO)
## Dokumentenkontrolle
| Feld | Wert |
|------|------|
| Unternehmen | {{COMPANY_NAME}} |
| Dokumenttyp | Technische und Organisatorische Massnahmen |
| Version | {{DOCUMENT_VERSION}} |
| Datum | {{VERSION_DATE}} |
| Klassifizierung | Vertraulich |
| IT-Sicherheitsbeauftragter | {{ISB_NAME}} |
| Datenschutzbeauftragter | {{DPO_NAME}} |
| Geschaeftsfuehrung | {{GF_NAME}} |
| Naechste Pruefung | {{NEXT_REVIEW_DATE}} |
### Aenderungshistorie
| Version | Datum | Autor | Aenderung |
|---------|-------|-------|-----------|
| {{DOCUMENT_VERSION}} | {{VERSION_DATE}} | {{ISB_NAME}} | Erstfassung |
---
## 1. Ziel und Zweck
Diese TOM-Dokumentation beschreibt die technischen und organisatorischen Massnahmen zum Schutz personenbezogener Daten bei **{{COMPANY_NAME}}**. Sie dient der Umsetzung folgender DSGVO-Anforderungen:
| Rechtsgrundlage | Inhalt |
|-----------------|--------|
| **Art. 32 Abs. 1 lit. a DSGVO** | Pseudonymisierung und Verschluesselung personenbezogener Daten |
| **Art. 32 Abs. 1 lit. b DSGVO** | Vertraulichkeit, Integritaet, Verfuegbarkeit und Belastbarkeit der Systeme auf Dauer sicherstellen |
| **Art. 32 Abs. 1 lit. c DSGVO** | Rasche Wiederherstellung der Verfuegbarkeit bei physischem oder technischem Zwischenfall |
| **Art. 32 Abs. 1 lit. d DSGVO** | Regelmaessige Ueberpruefung, Bewertung und Evaluierung der Wirksamkeit der Massnahmen |
Die TOM-Dokumentation ist fester Bestandteil des Datenschutz-Managementsystems und wird regelmaessig ueberprueft und aktualisiert.
---
## 2. Geltungsbereich
Diese TOM-Dokumentation gilt fuer alle IT-Systeme, Anwendungen und Verarbeitungsprozesse von **{{COMPANY_NAME}}**. Die dokumentierten Massnahmen stammen aus zwei Quellen:
- **Embedded Library (TOM-xxx):** Integrierte Kontrollbibliothek mit spezifischen Massnahmen fuer Art. 32 DSGVO
- **Canonical Control Library (CP-CLIB):** Uebergreifende Kontrollbibliothek mit framework-uebergreifenden Massnahmen
---
## 3. Grundprinzipien Art. 32
- **Vertraulichkeit:** Schutz personenbezogener Daten vor unbefugter Kenntnisnahme durch Zutrittskontrolle, Zugangskontrolle, Zugriffskontrolle und Verschluesselung (Art. 32 Abs. 1 lit. b DSGVO).
- **Integritaet:** Sicherstellung, dass personenbezogene Daten nicht unbefugt oder unbeabsichtigt veraendert werden koennen, durch Eingabekontrolle, Weitergabekontrolle und Protokollierung (Art. 32 Abs. 1 lit. b DSGVO).
- **Verfuegbarkeit und Belastbarkeit:** Gewaehrleistung, dass Systeme und Dienste bei Lastspitzen und Stoerungen zuverlaessig funktionieren, durch Backup, Redundanz und Disaster Recovery (Art. 32 Abs. 1 lit. b DSGVO).
- **Rasche Wiederherstellbarkeit:** Faehigkeit, nach einem physischen oder technischen Zwischenfall Daten und Systeme schnell wiederherzustellen, durch getestete Recovery-Prozesse (Art. 32 Abs. 1 lit. c DSGVO).
- **Regelmaessige Wirksamkeitspruefung:** Verfahren zur regelmaessigen Ueberpruefung, Bewertung und Evaluierung der Wirksamkeit aller technischen und organisatorischen Massnahmen (Art. 32 Abs. 1 lit. d DSGVO).
---
## 4. Schutzbedarf und Risikoanalyse
Die Schutzbedarfsanalyse bildet die Grundlage fuer die Auswahl und Priorisierung der Massnahmen.
| Kriterium | Bewertung |
|-----------|-----------|
| Vertraulichkeit | *Wird vom TOM-Generator automatisch ermittelt* |
| Integritaet | *Wird vom TOM-Generator automatisch ermittelt* |
| Verfuegbarkeit | *Wird vom TOM-Generator automatisch ermittelt* |
| Schutzniveau | *Basiert auf CIA-Bewertung* |
| DSFA-Pflicht | *Wird automatisch berechnet* |
**Hinweis:** Die detaillierte Schutzbedarfsanalyse wird im TOM-Modul ueber den Risiko-Wizard durchgefuehrt. Die Ergebnisse fliessen automatisch in die Massnahmenauswahl ein.
---
## 5. Massnahmenkatalog
### 5.1 Zutrittskontrolle
Massnahmen zur Verhinderung des unbefugten Zutritts zu Datenverarbeitungsanlagen.
| Massnahme | Typ | Status | Verantwortlich |
|-----------|-----|--------|----------------|
| *Wird automatisch aus dem TOM-Modul befuellt* | | | |
### 5.2 Zugangskontrolle
Massnahmen zur Verhinderung der unbefugten Nutzung von Datenverarbeitungssystemen.
| Massnahme | Typ | Status | Verantwortlich |
|-----------|-----|--------|----------------|
| *Wird automatisch aus dem TOM-Modul befuellt* | | | |
### 5.3 Zugriffskontrolle
Massnahmen, die gewaehrleisten, dass ausschliesslich berechtigte Personen auf Daten zugreifen koennen.
| Massnahme | Typ | Status | Verantwortlich |
|-----------|-----|--------|----------------|
| *Wird automatisch aus dem TOM-Modul befuellt* | | | |
### 5.4 Weitergabekontrolle
Massnahmen zum Schutz personenbezogener Daten bei elektronischer Uebertragung und Transport.
| Massnahme | Typ | Status | Verantwortlich |
|-----------|-----|--------|----------------|
| *Wird automatisch aus dem TOM-Modul befuellt* | | | |
### 5.5 Eingabekontrolle
Massnahmen zur nachtraeglichen Ueberpruefung, ob und von wem Daten eingegeben, veraendert oder entfernt worden sind.
| Massnahme | Typ | Status | Verantwortlich |
|-----------|-----|--------|----------------|
| *Wird automatisch aus dem TOM-Modul befuellt* | | | |
### 5.6 Auftragskontrolle
Massnahmen, die gewaehrleisten, dass personenbezogene Daten nur entsprechend den Weisungen des Auftraggebers verarbeitet werden.
| Massnahme | Typ | Status | Verantwortlich |
|-----------|-----|--------|----------------|
| *Wird automatisch aus dem TOM-Modul befuellt* | | | |
### 5.7 Verschluesselung und Pseudonymisierung
Massnahmen zur Pseudonymisierung und Verschluesselung personenbezogener Daten (Art. 32 Abs. 1 lit. a DSGVO).
| Massnahme | Typ | Status | Verantwortlich |
|-----------|-----|--------|----------------|
| *Wird automatisch aus dem TOM-Modul befuellt* | | | |
### 5.8 Verfuegbarkeit und Belastbarkeit
Massnahmen zur Gewaehrleistung der Verfuegbarkeit und Belastbarkeit der Systeme (Art. 32 Abs. 1 lit. b DSGVO).
| Massnahme | Typ | Status | Verantwortlich |
|-----------|-----|--------|----------------|
| *Wird automatisch aus dem TOM-Modul befuellt* | | | |
### 5.9 Wiederherstellbarkeit
Massnahmen zur raschen Wiederherstellung der Verfuegbarkeit nach einem Zwischenfall (Art. 32 Abs. 1 lit. c DSGVO).
| Massnahme | Typ | Status | Verantwortlich |
|-----------|-----|--------|----------------|
| *Wird automatisch aus dem TOM-Modul befuellt* | | | |
### 5.10 Ueberpruefung und Bewertung
Verfahren zur regelmaessigen Ueberpruefung, Bewertung und Evaluierung (Art. 32 Abs. 1 lit. d DSGVO).
| Massnahme | Typ | Status | Verantwortlich |
|-----------|-----|--------|----------------|
| *Wird automatisch aus dem TOM-Modul befuellt* | | | |
---
## 6. SDM Gewaehrleistungsziele
Das Standard-Datenschutzmodell (SDM) definiert sieben Gewaehrleistungsziele. Die implementierten Massnahmen decken folgende Ziele ab:
| Gewaehrleistungsziel | Abgedeckt | Gesamt | Abdeckung (%) |
|----------------------|-----------|--------|---------------|
| Verfuegbarkeit | *automatisch* | | |
| Integritaet | *automatisch* | | |
| Vertraulichkeit | *automatisch* | | |
| Nichtverkettung | *automatisch* | | |
| Intervenierbarkeit | *automatisch* | | |
| Transparenz | *automatisch* | | |
| Datenminimierung | *automatisch* | | |
---
## 7. Verantwortlichkeiten
| Rolle | Aufgabe |
|-------|---------|
| Geschaeftsfuehrung ({{GF_NAME}}) | Gesamtverantwortung, Freigabe der TOM-Dokumentation |
| IT-Sicherheitsbeauftragter ({{ISB_NAME}}) | Pflege und Umsetzung technischer Massnahmen |
| Datenschutzbeauftragter ({{DPO_NAME}}) | Ueberwachung, Beratung, Compliance-Check |
| Fachabteilungen | Umsetzung organisatorischer Massnahmen, Meldepflicht |
---
## 8. Compliance-Status
*Der aktuelle Compliance-Score wird vom TOM-Modul automatisch berechnet und enthaelt Befunde nach Schweregrad (Kritisch, Hoch, Mittel, Niedrig).*
| Kennzahl | Wert |
|----------|------|
| Gepruefte Massnahmen | *automatisch* |
| Bestanden | *automatisch* |
| Beanstandungen | *automatisch* |
---
## 9. Pruef- und Revisionszyklus
| Eigenschaft | Wert |
|-------------|------|
| Pruefintervall | Jaehrlich |
| Letzte Pruefung | {{VERSION_DATE}} |
| Naechste Pruefung | {{NEXT_REVIEW_DATE}} |
| Aktuelle Version | {{DOCUMENT_VERSION}} |
### Pruefpunkte
- Vollstaendigkeit aller Massnahmen (neue Systeme oder Verarbeitungen erfasst?)
- Aktualitaet des Umsetzungsstatus (Aenderungen seit letzter Pruefung?)
- Wirksamkeit der technischen Massnahmen (Penetration-Tests, Audit-Ergebnisse)
- Angemessenheit der organisatorischen Massnahmen (Schulungen, Richtlinien aktuell?)
- Abdeckung aller SDM-Gewaehrleistungsziele
- Zuordnung von Verantwortlichkeiten zu allen Massnahmen
---
*Dieses Dokument wird automatisch vom TOM-Modul generiert und enthaelt alle erfassten technischen und organisatorischen Massnahmen nach Art. 32 DSGVO.*
*Erstellt mit BreakPilot Compliance — {{COMPANY_NAME}} | Stand: {{VERSION_DATE}} | Version {{DOCUMENT_VERSION}}*
$template$,
'["COMPANY_NAME","ISB_NAME","GF_NAME","DPO_NAME","DOCUMENT_VERSION","VERSION_DATE","NEXT_REVIEW_DATE"]'::jsonb,
'de', 'DE',
'mit', 'MIT License', 'BreakPilot Compliance',
false, true, '1.0.0', 'published',
NOW(), NOW()
WHERE NOT EXISTS (
SELECT 1 FROM compliance_legal_templates
WHERE document_type = 'tom_documentation'
AND tenant_id = '9282a473-5c95-4b3a-bf78-0ecc0ec71d3e'
);
-- ===========================================================================
-- Template 3: Loeschkonzept (Art. 5/17 DSGVO)
-- ===========================================================================
INSERT INTO compliance_legal_templates (
id, tenant_id, document_type, title, description, content,
placeholders, language, jurisdiction,
license_id, license_name, source_name,
attribution_required, is_complete_document, version, status,
created_at, updated_at
) SELECT
gen_random_uuid(),
'9282a473-5c95-4b3a-bf78-0ecc0ec71d3e',
'loeschkonzept',
'Loeschkonzept (Art. 5/17 DSGVO)',
'Systematisches Loeschkonzept gemaess Art. 5 Abs. 1 lit. e und Art. 17 DSGVO. Dokumentiert Loeschregeln, Aufbewahrungstreiber, Loeschmethoden, Legal Holds und Auftragsverarbeiter-Verknuepfungen.',
$template$# Loeschkonzept (Art. 5/17 DSGVO)
## Dokumentenkontrolle
| Feld | Wert |
|------|------|
| Unternehmen | {{COMPANY_NAME}} |
| Dokumenttyp | Loeschkonzept |
| Version | {{DOCUMENT_VERSION}} |
| Datum | {{VERSION_DATE}} |
| Klassifizierung | Vertraulich |
| Datenschutzbeauftragter | {{DPO_NAME}} |
| Kontakt DSB | {{DPO_CONTACT}} |
| Naechste Pruefung | {{NEXT_REVIEW_DATE}} |
### Aenderungshistorie
| Version | Datum | Autor | Aenderung |
|---------|-------|-------|-----------|
| {{DOCUMENT_VERSION}} | {{VERSION_DATE}} | {{DPO_NAME}} | Erstfassung |
---
## 1. Ziel und Zweck
Dieses Loeschkonzept definiert die systematischen Regeln und Verfahren fuer die Loeschung personenbezogener Daten bei **{{COMPANY_NAME}}**. Es dient der Umsetzung folgender DSGVO-Anforderungen:
| Rechtsgrundlage | Inhalt |
|-----------------|--------|
| **Art. 5 Abs. 1 lit. e DSGVO** | Grundsatz der Speicherbegrenzung — Daten nur so lange speichern, wie fuer den Zweck erforderlich |
| **Art. 17 DSGVO** | Recht auf Loeschung ("Recht auf Vergessenwerden") — Betroffene koennen Loeschung verlangen |
| **Art. 30 DSGVO** | Verzeichnis von Verarbeitungstaetigkeiten — Loeschfristen muessen dokumentiert werden |
| **Art. 25 DSGVO** | Datenschutz durch Technikgestaltung — Loeschmechanismen moeglichst automatisiert |
Das Loeschkonzept ist fester Bestandteil des Datenschutz-Managementsystems und wird regelmaessig ueberprueft und aktualisiert.
---
## 2. Rechtsgrundlagen und Aufbewahrungstreiber
### Gesetzliche Aufbewahrungspflichten
| Aufbewahrungstreiber | Gesetz / Vorschrift | Frist |
|----------------------|---------------------|-------|
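| Handelsrechtliche Aufbewahrung | § 257 HGB | 6 Jahre (Handelsbriefe), 8 Jahre (Buchungsbelege), 10 Jahre (Handelsbuecher, Inventare, Jahresabschluesse) |
| Steuerrechtliche Aufbewahrung | § 147 AO | 6 Jahre (Geschaeftsbriefe), 8 Jahre (Buchungsbelege), 10 Jahre (Buecher, Aufzeichnungen, Jahresabschluesse) |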
| Arbeitsrechtliche Aufbewahrung | Diverse arbeitsrechtliche Vorschriften | 3-10 Jahre je nach Dokumenttyp |
| Sozialversicherungsrechtlich | §§ 28f, 110 SGB IV | 5 Jahre |
| Produkthaftung | § 13 ProdHaftG | 10 Jahre |
| Beweissicherung | §§ 195-199 BGB | 3 Jahre (regelmaessige Verjaehrung) |
### 3-Level-Loeschlogik
Die Loeschung folgt einer dreistufigen Priorisierung:
1. **Zweckende:** Daten werden geloescht, sobald der Verarbeitungszweck entfaellt
2. **Gesetzliche Aufbewahrungspflichten:** Laengere Fristen aus HGB, AO etc. ueberschreiben Zweckende
3. **Legal Hold:** Aufbewahrungspflicht aufgrund rechtlicher Verfahren setzt alle anderen Fristen aus
---
## 3. Datenkategorien und Fristen
### Loeschregeln-Uebersicht
| LF-Nr. | Datenobjekt | Loeschtrigger | Aufbewahrungsfrist | Loeschmethode | Status |
|--------|-------------|---------------|--------------------|--------------:|--------|
| *Wird automatisch vom Loeschfristen-Modul befuellt* | | | | | |
### Detaillierte Loeschregeln
Fuer jede Loeschregel werden folgende Informationen dokumentiert:
| Feld | Beschreibung |
|------|-------------|
| Beschreibung | Detaillierte Beschreibung der betroffenen Daten |
| Betroffenengruppen | Kategorien betroffener Personen |
| Datenkategorien | Art der personenbezogenen Daten |
| Verarbeitungszweck | Primaerer Zweck der Datenverarbeitung |
| Loeschtrigger | Ereignis, das die Loeschfrist ausloest |
| Aufbewahrungstreiber | Gesetzliche Grundlage fuer die Aufbewahrung |
| Aufbewahrungsfrist | Dauer der Aufbewahrung mit Einheit |
| Startereignis | Beginn der Fristberechnung |
| Loeschmethode | Technisches Verfahren (Loeschung, Anonymisierung, Vernichtung) |
| Speicherorte | Betroffene Systeme und Datenbanken |
| Verantwortlich | Person oder Rolle |
| Pruefintervall | Frequenz der Kontrolle |
---
## 4. Loeschmethoden
| Methode | Beschreibung | Anwendung |
|---------|-------------|-----------|
| **Physische Loeschung** | Unwiderrufliches Entfernen der Daten aus allen Systemen | Standard fuer nicht mehr benoetigte Daten |
| **Anonymisierung** | Entfernen des Personenbezugs, sodass Daten nicht mehr zuordenbar sind | Statistik, Forschung, Archivierung |
| **Pseudonymisierung** | Ersetzen identifizierender Merkmale durch Pseudonyme | Zwischenschritt, kein Ersatz fuer Loeschung |
| **Physische Vernichtung** | Physische Zerstoerung der Datentraeger (Shredding, Degaussing) | Datentraeger-Entsorgung |
| **Kryptographische Loeschung** | Vernichtung der Schluessel bei verschluesselten Daten | Cloud-Umgebungen, verschluesselte Backups |
---
## 5. Verantwortlichkeiten
| Rolle | Aufgabe |
|-------|---------|
| Datenschutzbeauftragter ({{DPO_NAME}}) | Ueberwachung, Beratung, Compliance-Pruefung |
| Fachabteilungen | Definition der Zweckende, Meldung neuer Datenkategorien |
| IT-Abteilung | Technische Umsetzung der Loeschmechanismen |
| Rechtsabteilung | Bewertung gesetzlicher Aufbewahrungspflichten, Legal Hold |
---
## 6. Legal Hold Verfahren
Ein Legal Hold setzt die regulaere Loeschung aus. Betroffene Daten duerfen trotz abgelaufener Frist nicht geloescht werden, bis der Hold aufgehoben wird.
### Verfahrensschritte
1. Rechtsabteilung / DSB identifiziert betroffene Datenkategorien
2. Legal Hold wird im System aktiviert (Status: Aktiv)
3. Automatische Loeschung wird fuer betroffene Policies ausgesetzt
4. Regelmaessige Pruefung, ob der Legal Hold noch erforderlich ist
5. Nach Aufhebung: Regulaere Loeschfristen greifen wieder
### Aktive Legal Holds
*Wird automatisch vom Loeschfristen-Modul befuellt. Enthaelt: Datenobjekt, Grund, Rechtsgrundlage, Beginn, voraussichtliches Ende.*
---
## 7. Auftragsverarbeiter mit Loeschpflichten
Loeschregeln, die mit Auftragsverarbeitern verknuepft sind, stellen sicher, dass auch bei extern verarbeiteten Daten die Loeschpflichten eingehalten werden (Art. 28 DSGVO).
| Loeschregel | LF-Nr. | Auftragsverarbeiter | Aufbewahrungsfrist |
|-------------|--------|--------------------|--------------------|
| *Wird automatisch vom Loeschfristen-Modul befuellt* | | | |
**Hinweis:** Die vollstaendige Auftragsverarbeiter-Dokumentation wird im Vendor-Compliance-Modul gefuehrt.
---
## 8. VVT-Verknuepfung
Die Loeschregeln sind mit den Verarbeitungstaetigkeiten im Verarbeitungsverzeichnis (Art. 30 DSGVO) verknuepft:
| Loeschregel | LF-Nr. | VVT-Nr. | Verarbeitungstaetigkeit |
|-------------|--------|---------|-------------------------|
| *Wird automatisch vom Loeschfristen-Modul befuellt* | | | |
---
## 9. Compliance-Status
*Der aktuelle Compliance-Score wird vom Loeschfristen-Modul automatisch berechnet und enthaelt Befunde nach Schweregrad (Kritisch, Hoch, Mittel, Niedrig).*
| Kennzahl | Wert |
|----------|------|
| Gepruefte Policies | *automatisch* |
| Bestanden | *automatisch* |
| Beanstandungen | *automatisch* |
---
## 10. Pruef- und Revisionszyklus
| Eigenschaft | Wert |
|-------------|------|
| Pruefintervall | Jaehrlich |
| Letzte Pruefung | {{VERSION_DATE}} |
| Naechste Pruefung | {{NEXT_REVIEW_DATE}} |
| Aktuelle Version | {{DOCUMENT_VERSION}} |
### Pruefpunkte
- Vollstaendigkeit aller Loeschregeln (neue Verarbeitungen erfasst?)
- Aktualitaet der gesetzlichen Aufbewahrungsfristen
- Wirksamkeit der technischen Loeschmechanismen
- Einhaltung der definierten Loeschfristen
- Angemessenheit der Verantwortlichkeiten
- VVT-Verknuepfung vollstaendig?
---
*Dieses Dokument wird automatisch vom Loeschfristen-Modul generiert und enthaelt alle erfassten Loeschregeln mit Aufbewahrungstreibern, Fristen und Verantwortlichkeiten.*
*Erstellt mit BreakPilot Compliance — {{COMPANY_NAME}} | Stand: {{VERSION_DATE}} | Version {{DOCUMENT_VERSION}}*
$template$,
'["COMPANY_NAME","DPO_NAME","DPO_CONTACT","DOCUMENT_VERSION","VERSION_DATE","NEXT_REVIEW_DATE"]'::jsonb,
'de', 'DE',
'mit', 'MIT License', 'BreakPilot Compliance',
false, true, '1.0.0', 'published',
NOW(), NOW()
WHERE NOT EXISTS (
SELECT 1 FROM compliance_legal_templates
WHERE document_type = 'loeschkonzept'
AND tenant_id = '9282a473-5c95-4b3a-bf78-0ecc0ec71d3e'
);
-- ===========================================================================
-- Template 4: Pflichtenregister (DSGVO/AI-Act)
-- ===========================================================================
INSERT INTO compliance_legal_templates (
id, tenant_id, document_type, title, description, content,
placeholders, language, jurisdiction,
license_id, license_name, source_name,
attribution_required, is_complete_document, version, status,
created_at, updated_at
) SELECT
gen_random_uuid(),
'9282a473-5c95-4b3a-bf78-0ecc0ec71d3e',
'pflichtenregister',
'Pflichtenregister (DSGVO/AI-Act)',
'Vollstaendiges Pflichtenregister fuer alle regulatorischen Pflichten aus DSGVO, AI Act, NIS2 und BDSG. Dokumentiert Pflichten, Verantwortlichkeiten, Fristen, Nachweise und Compliance-Status.',
$template$# Pflichtenregister (DSGVO / AI Act / NIS2)
## Dokumentenkontrolle
| Feld | Wert |
|------|------|
| Unternehmen | {{COMPANY_NAME}} |
| Dokumenttyp | Pflichtenregister |
| Version | {{DOCUMENT_VERSION}} |
| Datum | {{VERSION_DATE}} |
| Klassifizierung | Vertraulich |
| Datenschutzbeauftragter | {{DPO_NAME}} |
| Kontakt DSB | {{DPO_CONTACT}} |
| Verantwortlicher | {{RESPONSIBLE_PERSON}} |
| Rechtsabteilung | {{LEGAL_DEPARTMENT}} |
| Naechste Pruefung | {{NEXT_REVIEW_DATE}} |
### Aenderungshistorie
| Version | Datum | Autor | Aenderung |
|---------|-------|-------|-----------|
| {{DOCUMENT_VERSION}} | {{VERSION_DATE}} | {{DPO_NAME}} | Erstfassung |
---
## 1. Ziel und Zweck
Dieses Pflichtenregister dokumentiert alle regulatorischen Pflichten, denen **{{COMPANY_NAME}}** unterliegt. Es dient der systematischen Erfassung, Ueberwachung und Nachverfolgung aller Compliance-Anforderungen aus den anwendbaren Regulierungen.
### Zwecke des Registers
- Vollstaendige Erfassung aller anwendbaren regulatorischen Pflichten
- Zuordnung von Verantwortlichkeiten und Fristen
- Nachverfolgung des Umsetzungsstatus
- Dokumentation von Nachweisen fuer Audits
- Identifikation von Compliance-Luecken und Handlungsbedarf
### Rechtsrahmen
| Rechtsrahmen | Relevanz |
|-------------|----------|
| **DSGVO (EU) 2016/679** | Datenschutz-Grundverordnung — Kernregulierung fuer personenbezogene Daten |
| **AI Act (EU) 2024/1689** | KI-Verordnung — Anforderungen an KI-Systeme nach Risikoklasse |
| **NIS2 (EU) 2022/2555** | Netzwerk- und Informationssicherheit — Cybersicherheitspflichten |
| **BDSG** | Bundesdatenschutzgesetz — Nationale Ergaenzung zur DSGVO |
---
## 2. Geltungsbereich
Dieses Pflichtenregister gilt fuer alle Geschaeftsprozesse und IT-Systeme von **{{COMPANY_NAME}}**. Es umfasst Pflichten aus allen anwendbaren Regulierungen, gruppiert nach Rechtsquelle.
### Anwendbare Regulierungen
| Regulierung | Anzahl Pflichten | Status |
|-------------|-----------------|--------|
| *Wird automatisch vom Pflichtenregister-Modul befuellt* | | |
---
## 3. Methodik
Die Identifikation und Bewertung der Pflichten erfolgt in drei Schritten:
1. **Pflicht-Identifikation:** Systematische Analyse aller anwendbaren Regulierungen und Extraktion der einzelnen Pflichten mit Artikel-Referenz, Beschreibung und Zielgruppe.
2. **Bewertung und Priorisierung:** Jede Pflicht wird nach Prioritaet (kritisch, hoch, mittel, niedrig) und Dringlichkeit (Frist) bewertet. Die Bewertung basiert auf dem Risikopotenzial bei Nichterfuellung.
3. **Ueberwachung und Nachverfolgung:** Regelmaessige Pruefung des Umsetzungsstatus, Aktualisierung der Fristen und Dokumentation von Nachweisen.
Die Pflichten werden ueber einen automatisierten Compliance-Check geprueft, der 11 Kriterien umfasst (siehe Abschnitt 10: Compliance-Status).
---
## 4. Regulatorische Grundlagen
| Regulierung | Pflichten | Kritisch | Hoch | Mittel | Niedrig | Abgeschlossen |
|-------------|----------|----------|------|--------|---------|---------------|
| *Wird automatisch vom Pflichtenregister-Modul befuellt* | | | | | | |
---
## 5. Pflichtenuebersicht
Uebersicht aller Pflichten nach Regulierung und Status:
| Regulierung | Gesamt | Ausstehend | In Bearbeitung | Abgeschlossen | Ueberfaellig |
|-------------|--------|------------|----------------|---------------|--------------|
| *Wird automatisch vom Pflichtenregister-Modul befuellt* | | | | | |
---
## 6. Detaillierte Pflichten
Fuer jede Pflicht werden folgende Informationen als Detailkarte dokumentiert:
| Feld | Beschreibung |
|------|-------------|
| Rechtsquelle | Regulierung und Artikel-Referenz |
| Beschreibung | Detaillierte Beschreibung der Pflicht |
| Prioritaet | Kritisch / Hoch / Mittel / Niedrig |
| Status | Ausstehend / In Bearbeitung / Abgeschlossen / Ueberfaellig |
| Verantwortlich | Person oder Abteilung |
| Frist | Umsetzungsfrist |
| Nachweise | Dokumentierte Belege fuer die Umsetzung |
| Betroffene Systeme | IT-Systeme, die von der Pflicht betroffen sind |
| Notizen | Zusaetzliche Anmerkungen und Handlungsempfehlungen |
### Pflichten nach Regulierung
*Die einzelnen Pflichten werden vom Pflichtenregister-Modul automatisch nach Rechtsquelle gruppiert und als Detailkarten mit allen Feldern in das Dokument eingefuegt. Die Sortierung erfolgt nach Prioritaet (kritisch zuerst).*
---
## 7. Verantwortlichkeiten
| Verantwortlich | Pflichten | Anzahl | Davon offen |
|----------------|----------|--------|-------------|
| *Wird automatisch vom Pflichtenregister-Modul befuellt* | | | |
### Rollenmatrix
| Rolle | Aufgabe |
|-------|---------|
| Verantwortlicher ({{RESPONSIBLE_PERSON}}) | Gesamtverantwortung fuer Compliance |
| Datenschutzbeauftragter ({{DPO_NAME}}) | Ueberwachung DSGVO-Pflichten, Beratung |
| Rechtsabteilung ({{LEGAL_DEPARTMENT}}) | Bewertung regulatorischer Aenderungen, NIS2/AI-Act |
| Fachabteilungen | Umsetzung zugewiesener Pflichten |
| IT-Abteilung | Umsetzung technischer Anforderungen |
---
## 8. Fristen-Uebersicht
### Ueberfaellige Pflichten
| Pflicht | Regulierung | Frist | Tage ueberfaellig | Prioritaet |
|---------|-------------|-------|--------------------:|-----------|
| *Wird automatisch vom Pflichtenregister-Modul befuellt* | | | | |
### Anstehende Fristen
| Pflicht | Regulierung | Frist | Verbleibend | Verantwortlich |
|---------|-------------|-------|-------------|----------------|
| *Wird automatisch vom Pflichtenregister-Modul befuellt* | | | | |
---
## 9. Nachweisregister
Dokumentation der Nachweise (Evidence) fuer die Umsetzung der Pflichten:
| Pflicht | Regulierung | Nachweise | Status |
|---------|-------------|-----------|--------|
| *Wird automatisch vom Pflichtenregister-Modul befuellt* | | | |
### Pflichten ohne Nachweise
*Das Modul identifiziert automatisch alle Pflichten, fuer die noch keine Nachweise hinterlegt wurden, und listet diese als Handlungsbedarf auf.*
---
## 10. Compliance-Status
*Der aktuelle Compliance-Score wird vom Pflichtenregister-Modul automatisch berechnet. Der Check umfasst 11 Kriterien und bewertet Befunde nach Schweregrad (Kritisch, Hoch, Mittel, Niedrig).*
| Kennzahl | Wert |
|----------|------|
| Compliance-Score | *automatisch (0-100)* |
| Befunde gesamt | *automatisch* |
| Kritisch | *automatisch* |
| Hoch | *automatisch* |
| Mittel | *automatisch* |
| Niedrig | *automatisch* |
### Befunde und Empfehlungen
| Schweregrad | Befund | Betroffene Pflichten | Empfehlung |
|-------------|--------|---------------------|------------|
| *Wird automatisch vom Compliance-Check befuellt* | | | |
---
## 11. Pruef- und Revisionszyklus
| Eigenschaft | Wert |
|-------------|------|
| Pruefintervall | Jaehrlich |
| Letzte Pruefung | {{VERSION_DATE}} |
| Naechste Pruefung | {{NEXT_REVIEW_DATE}} |
| Aktuelle Version | {{DOCUMENT_VERSION}} |
### Pruefpunkte
- Vollstaendigkeit: Sind alle anwendbaren Pflichten erfasst?
- Aktualitaet: Gibt es neue Regulierungen oder Gesetzesaenderungen?
- Umsetzungsstatus: Sind ueberfaellige Pflichten eskaliert?
- Nachweise: Sind fuer alle abgeschlossenen Pflichten Belege hinterlegt?
- Verantwortlichkeiten: Sind alle Pflichten zugewiesen?
- Fristen: Sind neue Fristen aus Gesetzesaenderungen beruecksichtigt?
---
*Dieses Dokument wird automatisch vom Pflichtenregister-Modul generiert und enthaelt alle erfassten regulatorischen Pflichten mit Verantwortlichkeiten, Fristen und Nachweisen.*
*Erstellt mit BreakPilot Compliance — {{COMPANY_NAME}} | Stand: {{VERSION_DATE}} | Version {{DOCUMENT_VERSION}}*
$template$,
'["COMPANY_NAME","DPO_NAME","DPO_CONTACT","RESPONSIBLE_PERSON","LEGAL_DEPARTMENT","DOCUMENT_VERSION","VERSION_DATE","NEXT_REVIEW_DATE"]'::jsonb,
'de', 'DE',
'mit', 'MIT License', 'BreakPilot Compliance',
false, true, '1.0.0', 'published',
NOW(), NOW()
WHERE NOT EXISTS (
SELECT 1 FROM compliance_legal_templates
WHERE document_type = 'pflichtenregister'
AND tenant_id = '9282a473-5c95-4b3a-bf78-0ecc0ec71d3e'
);
@@ -0,0 +1,73 @@
-- Migration 074: Control Dedup Engine — DB Schema
-- Supports the 4-stage dedup pipeline for atomic controls (Pass 0b).
--
-- Tables:
-- 1. control_parent_links — M:N parent linking (one control → many regulations)
-- 2. control_dedup_reviews — Review queue for borderline matches (0.85-0.92)
--
-- Everything runs inside one transaction (BEGIN ... COMMIT) and every CREATE
-- uses IF NOT EXISTS, so the file is safe to re-run.
BEGIN;
-- =============================================================================
-- 1. Control Parent Links (M:N)
-- Enables "1 Control erfuellt 5 Gesetze" (one control satisfies five laws) — the biggest USP.
-- An atomic control can have multiple parent controls from different
-- regulations/obligations. This replaces the 1:1 parent_control_uuid FK.
-- =============================================================================
CREATE TABLE IF NOT EXISTS control_parent_links (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
-- Both ends cascade: deleting either control removes the link row.
control_uuid UUID NOT NULL REFERENCES canonical_controls(id) ON DELETE CASCADE,
parent_control_uuid UUID NOT NULL REFERENCES canonical_controls(id) ON DELETE CASCADE,
-- Inline CHECK: PostgreSQL auto-names it control_parent_links_link_type_check.
link_type VARCHAR(30) NOT NULL DEFAULT 'decomposition'
CHECK (link_type IN ('decomposition', 'dedup_merge', 'manual', 'crosswalk')),
-- Two decimal places, constrained to the closed range [0, 1]; NULL is allowed.
confidence NUMERIC(3,2) DEFAULT 1.0
CHECK (confidence >= 0 AND confidence <= 1),
source_regulation VARCHAR(100),
source_article VARCHAR(100),
-- No ON DELETE action: a referenced obligation candidate cannot be deleted
-- while a link still points at it.
obligation_candidate_id UUID REFERENCES obligation_candidates(id),
created_at TIMESTAMPTZ DEFAULT NOW(),
-- At most one link per (control, parent) pair, regardless of link_type.
CONSTRAINT uq_parent_link UNIQUE (control_uuid, parent_control_uuid)
);
-- NOTE(review): uq_parent_link already creates an index whose leading column is
-- control_uuid, so idx_cpl_control may be redundant — confirm before relying on it.
CREATE INDEX IF NOT EXISTS idx_cpl_control ON control_parent_links(control_uuid);
CREATE INDEX IF NOT EXISTS idx_cpl_parent ON control_parent_links(parent_control_uuid);
CREATE INDEX IF NOT EXISTS idx_cpl_type ON control_parent_links(link_type);
COMMENT ON TABLE control_parent_links IS
'M:N parent links — one atomic control can fulfill multiple regulations/obligations. USP: "1 Control erfuellt 5 Gesetze"';
-- =============================================================================
-- 2. Control Dedup Reviews
-- Queue for borderline matches (similarity 0.85-0.92) that need human review.
-- Reviewed entries get status updated to accepted/rejected.
-- =============================================================================
CREATE TABLE IF NOT EXISTS control_dedup_reviews (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
-- The candidate is stored by value (id/title/objective text), not as an FK.
candidate_control_id VARCHAR(30) NOT NULL,
candidate_title TEXT NOT NULL,
candidate_objective TEXT,
-- Unlike control_parent_links, these FKs have no ON DELETE CASCADE: a canonical
-- control referenced by a review row cannot be deleted until the row is removed.
matched_control_uuid UUID REFERENCES canonical_controls(id),
matched_control_id VARCHAR(30),
-- Three decimal places. NOTE(review): no CHECK bounds this to [0, 1], unlike
-- control_parent_links.confidence — confirm whether that is intentional.
similarity_score NUMERIC(4,3) DEFAULT 0.0,
dedup_stage VARCHAR(40) NOT NULL,
dedup_details JSONB DEFAULT '{}',
parent_control_uuid UUID REFERENCES canonical_controls(id),
obligation_candidate_id UUID REFERENCES obligation_candidates(id),
-- Column is nullable; the CHECK only restricts non-NULL values.
review_status VARCHAR(20) DEFAULT 'pending'
CHECK (review_status IN ('pending', 'accepted_link', 'accepted_new', 'rejected')),
reviewed_by VARCHAR(100),
reviewed_at TIMESTAMPTZ,
review_notes TEXT,
created_at TIMESTAMPTZ DEFAULT NOW()
);
CREATE INDEX IF NOT EXISTS idx_cdr_status ON control_dedup_reviews(review_status);
CREATE INDEX IF NOT EXISTS idx_cdr_matched ON control_dedup_reviews(matched_control_uuid);
CREATE INDEX IF NOT EXISTS idx_cdr_parent ON control_dedup_reviews(parent_control_uuid);
CREATE INDEX IF NOT EXISTS idx_cdr_stage ON control_dedup_reviews(dedup_stage);
COMMENT ON TABLE control_dedup_reviews IS
'Review queue for borderline dedup matches (similarity 0.85-0.92). Human decides: link or new control.';
COMMIT;
@@ -0,0 +1,38 @@
-- Migration 075: Obligation Refinement Fields
-- Supports Merge Pass (implementation-level dedup) and metadata enrichment.
--
-- New fields:
-- merged_into_id — points to survivor obligation when merged
-- trigger_type — event / periodic / continuous
-- is_implementation_specific — true if obligation references concrete tool/protocol
-- =============================================================================
-- 1. Add merge tracking
-- =============================================================================
-- Self-referencing FK: a merged obligation points at the surviving row.
ALTER TABLE obligation_candidates
    ADD COLUMN IF NOT EXISTS merged_into_id UUID
        REFERENCES obligation_candidates(id);

-- Partial index: only rows that were actually merged are indexed.
CREATE INDEX IF NOT EXISTS idx_oc_merged_into
    ON obligation_candidates(merged_into_id)
    WHERE merged_into_id IS NOT NULL;

-- Allow 'merged' as release_state.
-- The old CHECK is dropped and the new one added in a SINGLE ALTER TABLE so the
-- swap is atomic: this file does not open a transaction, and with two separate
-- statements a failing ADD CONSTRAINT (e.g. an existing row with an unexpected
-- release_state) would leave the column without any CHECK at all.
ALTER TABLE obligation_candidates
    DROP CONSTRAINT IF EXISTS obligation_candidates_release_state_check,
    ADD CONSTRAINT obligation_candidates_release_state_check
        CHECK (release_state IN ('extracted', 'validated', 'rejected', 'composed', 'merged'));
-- =============================================================================
-- 2. Add enrichment metadata
-- =============================================================================
-- Both columns are added in one ALTER TABLE. IF NOT EXISTS is evaluated per
-- column, so a re-run (or a partially applied earlier run) is still a no-op
-- for whichever column already exists.
--   trigger_type               — NULL (unknown) or one of event / periodic / continuous
--   is_implementation_specific — defaults to FALSE
ALTER TABLE obligation_candidates
    ADD COLUMN IF NOT EXISTS trigger_type VARCHAR(20) DEFAULT NULL
        CHECK (trigger_type IS NULL OR trigger_type IN ('event', 'periodic', 'continuous')),
    ADD COLUMN IF NOT EXISTS is_implementation_specific BOOLEAN DEFAULT FALSE;
@@ -0,0 +1,125 @@
-- Migration 076: Anti-Fake-Evidence Guardrails (Phase 1)
--
-- Prevents "Compliance-Theater": generated content passed off as real evidence,
-- controls without evidence marked as "pass", unvalidated 100% compliance claims.
--
-- Changes (numbered like the sections below):
-- 1. New ENUM types for evidence confidence + truth status
-- 2. New value 'in_progress' for controlstatusenum
-- 3. New columns on compliance_evidence (confidence, truth, review tracking)
-- 4. status_justification column on compliance_controls
-- 5. New table compliance_llm_generation_audit
-- 6. Backfill existing evidence based on source
-- 7. Indexes on new columns
-- ============================================================================
-- 1. New ENUM types
-- ============================================================================
-- NOTE: This file deliberately does not open a transaction. Section 2 uses
-- ALTER TYPE ... ADD VALUE, which older PostgreSQL versions refuse to run inside
-- a transaction block (newer versions allow it but the new value cannot be used
-- until the transaction commits). Each statement is therefore auto-committed
-- separately, which is the default for psql scripts.
--
-- PostgreSQL has no CREATE TYPE IF NOT EXISTS, so each CREATE TYPE is wrapped in
-- a DO block that ignores duplicate_object. That makes this section re-runnable
-- like the rest of the migration (which uses IF NOT EXISTS throughout). An
-- already existing type is left untouched, even if its labels differ.
DO $$
BEGIN
    CREATE TYPE evidence_confidence_level AS ENUM (
        'E0', -- Generated / no real evidence (LLM output, placeholder)
        'E1', -- Uploaded but unreviewed (manual upload, no hash, no reviewer)
        'E2', -- Reviewed internally (human reviewed, hash verified)
        'E3', -- Observed by system (CI/CD pipeline, API with hash)
        'E4'  -- Validated by external auditor
    );
EXCEPTION
    WHEN duplicate_object THEN
        NULL; -- type already exists from a previous run
END $$;

DO $$
BEGIN
    CREATE TYPE evidence_truth_status AS ENUM (
        'generated',           -- Created by LLM / system generation
        'uploaded',            -- Manually uploaded by user
        'observed',            -- Automatically observed (CI/CD, monitoring)
        'validated_internal',  -- Reviewed + approved by internal reviewer
        'rejected',            -- Reviewed and rejected
        'provided_to_auditor', -- Shared with external auditor
        'accepted_by_auditor'  -- Accepted by external auditor
    );
EXCEPTION
    WHEN duplicate_object THEN
        NULL; -- type already exists from a previous run
END $$;
-- ============================================================================
-- 2. Add 'in_progress' to controlstatusenum
-- ============================================================================
-- ALTER TYPE ... ADD VALUE must not be wrapped in a transaction here: older
-- PostgreSQL versions reject it inside a transaction block, newer ones cannot
-- use the added value until the transaction has committed.
ALTER TYPE controlstatusenum ADD VALUE IF NOT EXISTS 'in_progress';
-- ============================================================================
-- 3. New columns on compliance_evidence
-- ============================================================================
-- Existing rows receive the column defaults (E1 / uploaded / usable as evidence);
-- the backfill in section 6 then reclassifies rows by their source.
ALTER TABLE compliance_evidence
ADD COLUMN IF NOT EXISTS confidence_level evidence_confidence_level DEFAULT 'E1',
ADD COLUMN IF NOT EXISTS truth_status evidence_truth_status DEFAULT 'uploaded',
ADD COLUMN IF NOT EXISTS generation_mode VARCHAR(100),
ADD COLUMN IF NOT EXISTS may_be_used_as_evidence BOOLEAN DEFAULT TRUE,
ADD COLUMN IF NOT EXISTS reviewed_by VARCHAR(200),
ADD COLUMN IF NOT EXISTS reviewed_at TIMESTAMPTZ;
-- ============================================================================
-- 4. status_justification on compliance_controls
-- ============================================================================
-- Free-text, nullable column; no constraint is placed on it at the DB level.
ALTER TABLE compliance_controls
ADD COLUMN IF NOT EXISTS status_justification TEXT;
-- ============================================================================
-- 5. LLM Generation Audit table
-- ============================================================================
-- One row per LLM generation event. Defaults are the conservative ones:
-- truth_status 'generated' and may_be_used_as_evidence FALSE.
CREATE TABLE IF NOT EXISTS compliance_llm_generation_audit (
-- UUID stored as text (36 chars), generated by the database.
id VARCHAR(36) PRIMARY KEY DEFAULT gen_random_uuid()::text,
tenant_id VARCHAR(36),
entity_type VARCHAR(50) NOT NULL, -- 'evidence', 'control', 'document', ...
entity_id VARCHAR(36), -- id of the generated entity (logical reference; no FK constraint is declared)
generation_mode VARCHAR(100) NOT NULL, -- 'draft_assistance', 'auto_generation', ...
truth_status evidence_truth_status NOT NULL DEFAULT 'generated',
may_be_used_as_evidence BOOLEAN NOT NULL DEFAULT FALSE,
llm_model VARCHAR(100),
llm_provider VARCHAR(50), -- 'ollama', 'anthropic', ...
prompt_hash VARCHAR(64), -- SHA-256 of the prompt
input_summary TEXT, -- Truncated input for auditability
output_summary TEXT, -- Truncated output for auditability
metadata JSONB DEFAULT '{}'::jsonb,
created_at TIMESTAMPTZ DEFAULT NOW(),
-- Only a DEFAULT is defined; this migration adds no trigger to refresh it on UPDATE.
updated_at TIMESTAMPTZ DEFAULT NOW()
);
-- ============================================================================
-- 6. Backfill existing evidence based on source
-- ============================================================================
-- Only rows still at the column default (confidence_level = 'E1') are touched,
-- so rows that were already reclassified are never downgraded or overwritten.

-- Machine-observed evidence (CI pipeline or API) → E3 + observed
UPDATE compliance_evidence
   SET truth_status     = 'observed',
       confidence_level = 'E3'
 WHERE confidence_level = 'E1'
   AND source IN ('ci_pipeline', 'api');

-- Generated evidence → E0 + generated, and no longer usable as evidence
UPDATE compliance_evidence
   SET may_be_used_as_evidence = FALSE,
       truth_status            = 'generated',
       confidence_level        = 'E0'
 WHERE confidence_level = 'E1'
   AND source = 'generated';

-- Manual/upload evidence stays at E1 + uploaded (the column defaults).
-- ============================================================================
-- 7. Indexes
-- ============================================================================
-- Single-column indexes on the new evidence classification columns.
CREATE INDEX IF NOT EXISTS ix_evidence_confidence ON compliance_evidence (confidence_level);
CREATE INDEX IF NOT EXISTS ix_evidence_truth_status ON compliance_evidence (truth_status);
-- NOTE(review): may_be_used_as_evidence is a BOOLEAN; a plain index on a
-- two-valued column is often too unselective to be chosen by the planner —
-- confirm this index is actually used.
CREATE INDEX IF NOT EXISTS ix_evidence_may_be_used ON compliance_evidence (may_be_used_as_evidence);
-- Audit rows are looked up by the entity they describe and by tenant.
CREATE INDEX IF NOT EXISTS ix_llm_audit_entity ON compliance_llm_generation_audit (entity_type, entity_id);
CREATE INDEX IF NOT EXISTS ix_llm_audit_tenant ON compliance_llm_generation_audit (tenant_id);
@@ -0,0 +1,37 @@
-- Migration 077: Anti-Fake-Evidence Phase 2
-- Assertions table, Four-Eyes columns on Evidence, Audit-Trail performance index
-- Every statement uses IF NOT EXISTS, so the file can be re-run safely.
-- 1A. Assertions table
-- One row per sentence of an entity's text (sentence_text + sentence_index).
CREATE TABLE IF NOT EXISTS compliance_assertions (
-- UUID stored as text (36 chars), generated by the database.
id VARCHAR(36) PRIMARY KEY DEFAULT gen_random_uuid()::text,
tenant_id VARCHAR(36),
-- (entity_type, entity_id) is a logical reference; no FK constraint is declared.
entity_type VARCHAR(50) NOT NULL,
entity_id VARCHAR(36) NOT NULL,
sentence_text TEXT NOT NULL,
sentence_index INTEGER NOT NULL DEFAULT 0,
-- NOTE(review): free-form string with no CHECK; the set of valid values is
-- enforced (if at all) by the application — confirm against the API layer.
assertion_type VARCHAR(20) NOT NULL DEFAULT 'assertion',
-- JSON array, empty by default. Presumably holds compliance_evidence ids —
-- TODO confirm; referential integrity cannot be enforced on JSONB.
evidence_ids JSONB DEFAULT '[]'::jsonb,
confidence FLOAT DEFAULT 0.0,
normative_tier VARCHAR(20),
verified_by VARCHAR(200),
verified_at TIMESTAMPTZ,
created_at TIMESTAMPTZ DEFAULT NOW(),
updated_at TIMESTAMPTZ DEFAULT NOW()
);
CREATE INDEX IF NOT EXISTS ix_assertion_entity ON compliance_assertions (entity_type, entity_id);
CREATE INDEX IF NOT EXISTS ix_assertion_type ON compliance_assertions (assertion_type);
CREATE INDEX IF NOT EXISTS ix_assertion_tenant ON compliance_assertions (tenant_id);
-- 1B. Four-Eyes columns on Evidence
-- Two independent reviewer slots plus a flag marking evidence that needs both.
-- approval_status is a free-form string defaulting to 'none' (no CHECK constraint).
ALTER TABLE compliance_evidence
ADD COLUMN IF NOT EXISTS approval_status VARCHAR(30) DEFAULT 'none',
ADD COLUMN IF NOT EXISTS first_reviewer VARCHAR(200),
ADD COLUMN IF NOT EXISTS first_reviewed_at TIMESTAMPTZ,
ADD COLUMN IF NOT EXISTS second_reviewer VARCHAR(200),
ADD COLUMN IF NOT EXISTS second_reviewed_at TIMESTAMPTZ,
ADD COLUMN IF NOT EXISTS requires_four_eyes BOOLEAN DEFAULT FALSE;
CREATE INDEX IF NOT EXISTS ix_evidence_approval_status ON compliance_evidence (approval_status);
-- 1C. Audit-Trail performance index
-- Requires compliance_audit_trail (created outside this migration) to have the
-- columns entity_type, action and performed_at.
CREATE INDEX IF NOT EXISTS ix_audit_trail_entity_action
ON compliance_audit_trail (entity_type, action, performed_at);
@@ -0,0 +1,42 @@
-- Migration 078: Batch Dedup — Schema extensions for 85k→~18-25k reduction
-- Adds merged_into_uuid tracking, performance indexes for batch dedup,
-- and extends link_type CHECK to include 'cross_regulation'.
-- All changes run in one transaction, so the CHECK swap in section 3 is atomic.
BEGIN;
-- =============================================================================
-- 1. merged_into_uuid: Track which master a duplicate was merged into
-- =============================================================================
-- Self-referencing FK without an ON DELETE action: a master control cannot be
-- deleted while duplicates still point at it.
ALTER TABLE canonical_controls
ADD COLUMN IF NOT EXISTS merged_into_uuid UUID REFERENCES canonical_controls(id);
-- Partial index: only rows that were merged are indexed.
CREATE INDEX IF NOT EXISTS idx_cc_merged_into
ON canonical_controls(merged_into_uuid) WHERE merged_into_uuid IS NOT NULL;
-- =============================================================================
-- 2. Performance indexes for batch dedup queries
-- =============================================================================
-- Both indexes are partial (decomposition_method = 'pass0b'); a query must
-- include that predicate for the planner to be able to use them.
-- Index on merge_group_hint inside generation_metadata (for sub-grouping)
-- Expression index on the text value extracted with ->>.
CREATE INDEX IF NOT EXISTS idx_cc_merge_group_hint
ON canonical_controls ((generation_metadata->>'merge_group_hint'))
WHERE decomposition_method = 'pass0b';
-- Composite index for pattern-based dedup loading
CREATE INDEX IF NOT EXISTS idx_cc_pattern_dedup
ON canonical_controls (pattern_id, release_state)
WHERE decomposition_method = 'pass0b';
-- =============================================================================
-- 3. Extend link_type CHECK to include 'cross_regulation'
-- =============================================================================
-- control_parent_links_link_type_check is the name PostgreSQL auto-generates for
-- an unnamed inline CHECK on control_parent_links.link_type; the constraint is
-- re-created under the same name with the extended value list.
ALTER TABLE control_parent_links
DROP CONSTRAINT IF EXISTS control_parent_links_link_type_check;
ALTER TABLE control_parent_links
ADD CONSTRAINT control_parent_links_link_type_check
CHECK (link_type IN ('decomposition', 'dedup_merge', 'manual', 'crosswalk', 'cross_regulation'));
COMMIT;
@@ -0,0 +1,16 @@
-- Migration 079: Add evidence_type to canonical_controls
-- Classifies HOW a control is evidenced:
--   code    = Technical control, verifiable in source code / IaC / CI-CD
--   process = Organizational / governance control, verified via documents / policies
--   hybrid  = Both code and process evidence required
--
-- The migration is skipped when canonical_controls does not exist. The guard
-- uses to_regclass() so that it resolves the unqualified table name through
-- search_path exactly like the ALTER TABLE / CREATE INDEX statements it
-- protects. (Checking information_schema for one hard-coded schema could
-- disagree with that resolution: the migration would then either be skipped
-- silently although the table is reachable, or fail although the guard passed.)
DO $$
BEGIN
    IF to_regclass('canonical_controls') IS NOT NULL THEN
        -- NULL (unclassified) passes the CHECK; only non-NULL values are restricted.
        ALTER TABLE canonical_controls ADD COLUMN IF NOT EXISTS
            evidence_type VARCHAR(20) DEFAULT NULL
            CHECK (evidence_type IN ('code', 'process', 'hybrid'));
        CREATE INDEX IF NOT EXISTS idx_cc_evidence_type ON canonical_controls(evidence_type);
    END IF;
END $$;
@@ -0,0 +1,18 @@
-- V1 Control Enrichment: Cross-reference table for matching
-- in-house controls (v1, ungrouped, no source) → regulatory controls
CREATE TABLE IF NOT EXISTS v1_control_matches (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
-- Both ends cascade: deleting either control removes its match rows.
v1_control_uuid UUID NOT NULL REFERENCES canonical_controls(id) ON DELETE CASCADE,
matched_control_uuid UUID NOT NULL REFERENCES canonical_controls(id) ON DELETE CASCADE,
-- Three decimal places; no CHECK restricts the range.
similarity_score NUMERIC(4,3) NOT NULL,
-- NOTE(review): presumably 1 = best match for a given v1 control — confirm with the writer of this table.
match_rank SMALLINT NOT NULL DEFAULT 1,
matched_source TEXT, -- e.g. "DSGVO (EU) 2016/679"
matched_article TEXT, -- e.g. "Art. 32"
match_method VARCHAR(30) NOT NULL DEFAULT 'embedding',
created_at TIMESTAMPTZ DEFAULT NOW(),
-- A given pair of controls can be recorded only once.
CONSTRAINT uq_v1_match UNIQUE (v1_control_uuid, matched_control_uuid)
);
-- NOTE(review): uq_v1_match already provides an index led by v1_control_uuid,
-- so idx_v1m_v1 may be redundant — confirm.
CREATE INDEX IF NOT EXISTS idx_v1m_v1 ON v1_control_matches(v1_control_uuid);
CREATE INDEX IF NOT EXISTS idx_v1m_matched ON v1_control_matches(matched_control_uuid);
@@ -0,0 +1,11 @@
-- Migration 081: Add 'duplicate' release_state for obligation deduplication
--
-- Allows marking duplicate obligation_candidates as 'duplicate' instead of
-- deleting them, preserving traceability via merged_into_id.
--
-- The CHECK constraint is dropped and re-created in a SINGLE ALTER TABLE so the
-- swap is atomic: this file opens no transaction, and with two separate
-- statements a failing ADD CONSTRAINT would leave release_state unconstrained.
ALTER TABLE obligation_candidates
    DROP CONSTRAINT IF EXISTS obligation_candidates_release_state_check,
    ADD CONSTRAINT obligation_candidates_release_state_check
        CHECK (release_state IN ('extracted', 'validated', 'rejected', 'composed', 'merged', 'duplicate'));
@@ -0,0 +1,4 @@
-- control_parent_links.source_article / source_regulation: VARCHAR(100) → TEXT.
-- Long NIST references do not fit into 100 characters,
-- e.g. "SC-22 (und weitere redaktionelle Änderungen SC-7, SC-14, SC-17, ...)"
ALTER TABLE control_parent_links
    ALTER COLUMN source_article TYPE TEXT,
    ALTER COLUMN source_regulation TYPE TEXT;
@@ -0,0 +1,6 @@
# Optional: Cross-Encoder Re-Ranking (CPU-only PyTorch)
# Install separately: pip install -r requirements-reranker.txt
# Enable at runtime: RERANK_ENABLED=true
--extra-index-url https://download.pytorch.org/whl/cpu
torch
sentence-transformers>=3.0.0
+2
View File
@@ -22,6 +22,8 @@ python-multipart>=0.0.22
# AI / Anthropic (compliance AI assistant)
anthropic==0.75.0
# Re-Ranking: see requirements-reranker.txt (optional, CPU-only PyTorch)
# PDF Generation (GDPR export, audit reports)
weasyprint>=68.0
reportlab==4.2.5
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,562 @@
"""Tests for Anti-Fake-Evidence Phase 1 guardrails.
~45 tests covering:
- Evidence confidence classification
- Evidence truth status classification
- Control status transition state machine
- Multi-dimensional compliance score
- LLM generation audit
- Evidence review endpoint
"""
from datetime import datetime, timedelta
from unittest.mock import MagicMock, patch
from fastapi import FastAPI
from fastapi.testclient import TestClient
from compliance.api.evidence_routes import router as evidence_router
from compliance.api.llm_audit_routes import router as llm_audit_router
from compliance.api.evidence_routes import _classify_confidence, _classify_truth_status
from compliance.services.control_status_machine import validate_transition
from compliance.db.models import (
EvidenceConfidenceEnum,
EvidenceTruthStatusEnum,
ControlStatusEnum,
)
from classroom_engine.database import get_db
# ---------------------------------------------------------------------------
# App setup with mocked DB dependency
# ---------------------------------------------------------------------------
# Fixed identifiers and timestamp shared by the helpers and tests in this module.
EVIDENCE_UUID = "eeeeeeee-aaaa-bbbb-cccc-ffffffffffff"
CONTROL_UUID = "cccccccc-aaaa-bbbb-cccc-dddddddddddd"
NOW = datetime(2026, 3, 23, 12, 0, 0)

# One MagicMock stands in for the database session in every test of this module.
mock_db = MagicMock()


def override_get_db():
    """Yield the shared mock session in place of the real ``get_db`` dependency."""
    yield mock_db


# Minimal app: evidence routes at the root, LLM audit routes under /compliance.
app = FastAPI()
app.include_router(evidence_router)
app.include_router(llm_audit_router, prefix="/compliance")
app.dependency_overrides[get_db] = override_get_db

client = TestClient(app)
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def make_evidence(overrides=None):
    """Build a MagicMock that looks like an evidence record.

    The defaults describe CI-pipeline evidence (E3 / observed) with no review
    or four-eyes approval recorded. Entries in ``overrides`` (attribute name ->
    value) replace the corresponding defaults.
    """
    status = MagicMock()
    status.value = "valid"

    attrs = {
        "id": EVIDENCE_UUID,
        "control_id": CONTROL_UUID,
        "evidence_type": "test_results",
        "title": "Pytest Test Report",
        "description": "All tests passing",
        "artifact_url": "https://ci.example.com/job/123/artifact",
        "artifact_path": None,
        "artifact_hash": "abc123def456",
        "file_size_bytes": None,
        "mime_type": None,
        "status": status,
        "uploaded_by": None,
        "source": "ci_pipeline",
        "ci_job_id": "job-123",
        "valid_from": NOW,
        "valid_until": NOW + timedelta(days=90),
        "collected_at": NOW,
        "created_at": NOW,
        # Anti-fake-evidence fields
        "confidence_level": EvidenceConfidenceEnum.E3,
        "truth_status": EvidenceTruthStatusEnum.OBSERVED,
        "generation_mode": None,
        "may_be_used_as_evidence": True,
        "reviewed_by": None,
        "reviewed_at": None,
        # Phase 2 fields
        "approval_status": "none",
        "first_reviewer": None,
        "first_reviewed_at": None,
        "second_reviewer": None,
        "second_reviewed_at": None,
        "requires_four_eyes": False,
    }
    # Caller-supplied values win over the defaults.
    attrs.update(overrides or {})

    evidence = MagicMock()
    for name, value in attrs.items():
        setattr(evidence, name, value)
    return evidence
def make_control(overrides=None):
    """Build a MagicMock that looks like a control record.

    Defaults: control "GOV-001" ("Access Control") in status PLANNED. Entries in
    ``overrides`` (attribute name -> value) replace the corresponding defaults.
    """
    attrs = {
        "id": CONTROL_UUID,
        "control_id": "GOV-001",
        "title": "Access Control",
        "status": ControlStatusEnum.PLANNED,
    }
    # Caller-supplied values win over the defaults.
    attrs.update(overrides or {})

    control = MagicMock()
    for name, value in attrs.items():
        setattr(control, name, value)
    return control
# ===========================================================================
# 1. TestEvidenceConfidenceClassification
# ===========================================================================
class TestEvidenceConfidenceClassification:
    """Expected confidence level returned by ``_classify_confidence`` per source."""

    def test_ci_pipeline_returns_e3(self):
        level = _classify_confidence("ci_pipeline")
        assert level == EvidenceConfidenceEnum.E3

    def test_api_with_hash_returns_e3(self):
        level = _classify_confidence("api", artifact_hash="sha256:abc")
        assert level == EvidenceConfidenceEnum.E3

    def test_api_without_hash_returns_e3(self):
        # The hash is optional for API evidence: still E3 without one.
        level = _classify_confidence("api")
        assert level == EvidenceConfidenceEnum.E3

    def test_manual_returns_e1(self):
        level = _classify_confidence("manual")
        assert level == EvidenceConfidenceEnum.E1

    def test_upload_returns_e1(self):
        level = _classify_confidence("upload")
        assert level == EvidenceConfidenceEnum.E1

    def test_generated_returns_e0(self):
        level = _classify_confidence("generated")
        assert level == EvidenceConfidenceEnum.E0

    def test_unknown_source_returns_e1(self):
        # Unrecognised sources fall back to E1.
        level = _classify_confidence("some_random_source")
        assert level == EvidenceConfidenceEnum.E1

    def test_none_source_returns_e1(self):
        # A missing source falls back to E1 as well.
        level = _classify_confidence(None)
        assert level == EvidenceConfidenceEnum.E1
# ===========================================================================
# 2. TestEvidenceTruthStatus
# ===========================================================================
class TestEvidenceTruthStatus:
    """Expected truth status returned by ``_classify_truth_status`` per source."""

    def test_ci_pipeline_returns_observed(self):
        status = _classify_truth_status("ci_pipeline")
        assert status == EvidenceTruthStatusEnum.OBSERVED

    def test_manual_returns_uploaded(self):
        status = _classify_truth_status("manual")
        assert status == EvidenceTruthStatusEnum.UPLOADED

    def test_upload_returns_uploaded(self):
        status = _classify_truth_status("upload")
        assert status == EvidenceTruthStatusEnum.UPLOADED

    def test_generated_returns_generated(self):
        status = _classify_truth_status("generated")
        assert status == EvidenceTruthStatusEnum.GENERATED

    def test_api_returns_observed(self):
        status = _classify_truth_status("api")
        assert status == EvidenceTruthStatusEnum.OBSERVED

    def test_none_returns_uploaded(self):
        # A missing source is treated like an upload.
        status = _classify_truth_status(None)
        assert status == EvidenceTruthStatusEnum.UPLOADED
# ===========================================================================
# 3. TestControlStatusTransitions
# ===========================================================================
class TestControlStatusTransitions:
    """Exercise ``validate_transition``, the control status state machine.

    Every call returns an ``(allowed, violations)`` pair.
    """

    def test_planned_to_in_progress_allowed(self):
        ok, problems = validate_transition("planned", "in_progress")
        assert ok is True
        assert problems == []

    def test_in_progress_to_pass_without_evidence_blocked(self):
        ok, problems = validate_transition("in_progress", "pass", evidence_list=[])
        assert ok is False
        assert len(problems) > 0
        assert "pass" in problems[0].lower()

    def test_in_progress_to_pass_with_e2_evidence_allowed(self):
        reviewed = make_evidence({
            "confidence_level": EvidenceConfidenceEnum.E2,
            "truth_status": EvidenceTruthStatusEnum.VALIDATED_INTERNAL,
        })
        ok, problems = validate_transition("in_progress", "pass", evidence_list=[reviewed])
        assert ok is True
        assert problems == []

    def test_in_progress_to_pass_with_e1_evidence_blocked(self):
        unreviewed = make_evidence({
            "confidence_level": EvidenceConfidenceEnum.E1,
            "truth_status": EvidenceTruthStatusEnum.UPLOADED,
        })
        ok, problems = validate_transition("in_progress", "pass", evidence_list=[unreviewed])
        assert ok is False
        # The first violation names the required level.
        assert "E2" in problems[0]

    def test_in_progress_to_partial_with_evidence_allowed(self):
        # Even E0 evidence is accepted for "partial".
        generated = make_evidence({"confidence_level": EvidenceConfidenceEnum.E0})
        ok, _ = validate_transition("in_progress", "partial", evidence_list=[generated])
        assert ok is True

    def test_in_progress_to_partial_without_evidence_blocked(self):
        ok, _ = validate_transition("in_progress", "partial", evidence_list=[])
        assert ok is False

    def test_pass_to_fail_always_allowed(self):
        ok, _ = validate_transition("pass", "fail")
        assert ok is True

    def test_any_to_na_requires_justification(self):
        ok, problems = validate_transition("in_progress", "n/a", status_justification=None)
        assert ok is False
        assert "justification" in problems[0].lower()

    def test_any_to_na_with_justification_allowed(self):
        ok, _ = validate_transition(
            "in_progress",
            "n/a",
            status_justification="Not applicable for this project",
        )
        assert ok is True

    def test_any_to_planned_always_allowed(self):
        ok, _ = validate_transition("pass", "planned")
        assert ok is True

    def test_same_status_noop_allowed(self):
        ok, _ = validate_transition("pass", "pass")
        assert ok is True

    def test_bypass_for_auto_updater(self):
        # With the bypass flag, "pass" is allowed even with an empty evidence list.
        ok, _ = validate_transition(
            "in_progress",
            "pass",
            evidence_list=[],
            bypass_for_auto_updater=True,
        )
        assert ok is True

    def test_partial_to_pass_needs_e2(self):
        unreviewed = make_evidence({
            "confidence_level": EvidenceConfidenceEnum.E1,
            "truth_status": EvidenceTruthStatusEnum.UPLOADED,
        })
        ok, _ = validate_transition("partial", "pass", evidence_list=[unreviewed])
        assert ok is False

    def test_partial_to_pass_with_e3_allowed(self):
        observed = make_evidence({
            "confidence_level": EvidenceConfidenceEnum.E3,
            "truth_status": EvidenceTruthStatusEnum.OBSERVED,
        })
        ok, _ = validate_transition("partial", "pass", evidence_list=[observed])
        assert ok is True

    def test_in_progress_to_fail_allowed(self):
        ok, _ = validate_transition("in_progress", "fail")
        assert ok is True
# ===========================================================================
# 4. TestMultiDimensionalScore
# ===========================================================================
class TestMultiDimensionalScore:
    """Shape and edge cases of ``ControlRepository.get_multi_dimensional_score``."""

    @staticmethod
    def _score_without_controls():
        """Return the score computed when ``get_all`` yields no controls."""
        from compliance.db.repository import ControlRepository

        repo = ControlRepository(mock_db)
        with patch.object(repo, 'get_all', return_value=[]):
            return repo.get_multi_dimensional_score()

    def test_score_structure(self):
        """Score result should have all required keys."""
        result = self._score_without_controls()
        expected_keys = (
            "requirement_coverage",
            "evidence_strength",
            "validation_quality",
            "evidence_freshness",
            "control_effectiveness",
            "overall_readiness",
            "hard_blocks",
        )
        for key in expected_keys:
            assert key in result

    def test_empty_controls_returns_zeros(self):
        result = self._score_without_controls()
        assert result["overall_readiness"] == 0.0
        assert "Keine Controls" in result["hard_blocks"][0]

    def test_hard_blocks_pass_without_evidence(self):
        """Controls on 'pass' without evidence should trigger hard block."""
        from compliance.db.repository import ControlRepository

        repo = ControlRepository(mock_db)
        passing_control = make_control({"status": ControlStatusEnum.PASS})
        query_result = mock_db.query.return_value
        query_result.all.return_value = []  # no evidence
        query_result.scalar.return_value = 0
        with patch.object(repo, 'get_all', return_value=[passing_control]):
            result = repo.get_multi_dimensional_score()
        blocks = result["hard_blocks"]
        assert any("Evidence" in b or "evidence" in b.lower() for b in blocks)

    def test_all_dimensions_are_floats(self):
        result = self._score_without_controls()
        dimensions = [
            "requirement_coverage",
            "evidence_strength",
            "validation_quality",
            "evidence_freshness",
            "control_effectiveness",
            "overall_readiness",
        ]
        for key in dimensions:
            assert isinstance(result[key], float), f"{key} should be float"

    def test_hard_blocks_is_list(self):
        result = self._score_without_controls()
        assert isinstance(result["hard_blocks"], list)

    def test_backwards_compatibility_with_old_score(self):
        """get_statistics should still work and return compliance_score."""
        from compliance.db.repository import ControlRepository

        repo = ControlRepository(mock_db)
        query_result = mock_db.query.return_value
        query_result.scalar.return_value = 0
        query_result.group_by.return_value.all.return_value = []
        stats = repo.get_statistics()
        assert "compliance_score" in stats
        assert "total" in stats
# ===========================================================================
# 5. TestForbiddenFormulations
# ===========================================================================
class TestForbiddenFormulations:
    """Python-side schema checks accompanying the forbidden-formulation feature.

    Per the original note, the formulation check itself lives in TypeScript;
    these tests only cover the Pydantic schemas in ``compliance.api.schemas``.
    """

    def test_import_works(self):
        """The score and transition-error schemas import and build with defaults."""
        from compliance.api.schemas import MultiDimensionalScore, StatusTransitionError

        default_score = MultiDimensionalScore()
        assert default_score.overall_readiness == 0.0

        error = StatusTransitionError(current_status="planned", requested_status="pass")
        assert error.allowed is False

    def test_status_transition_error_schema(self):
        from compliance.api.schemas import StatusTransitionError

        payload = {
            "allowed": False,
            "current_status": "in_progress",
            "requested_status": "pass",
            "violations": ["Need E2 evidence"],
        }
        error = StatusTransitionError(**payload)
        assert error.violations == ["Need E2 evidence"]

    def test_multi_dimensional_score_defaults(self):
        from compliance.api.schemas import MultiDimensionalScore

        default_score = MultiDimensionalScore()
        assert default_score.requirement_coverage == 0.0
        assert default_score.hard_blocks == []

    def test_multi_dimensional_score_with_data(self):
        from compliance.api.schemas import MultiDimensionalScore

        values = {
            "requirement_coverage": 80.0,
            "evidence_strength": 60.0,
            "validation_quality": 40.0,
            "evidence_freshness": 90.0,
            "control_effectiveness": 70.0,
            "overall_readiness": 65.0,
            "hard_blocks": ["3 Controls ohne Evidence"],
        }
        score = MultiDimensionalScore(**values)
        assert score.overall_readiness == 65.0
        assert len(score.hard_blocks) == 1

    def test_evidence_response_has_anti_fake_fields(self):
        from compliance.api.schemas import EvidenceResponse

        declared = EvidenceResponse.model_fields
        anti_fake_fields = (
            "confidence_level",
            "truth_status",
            "generation_mode",
            "may_be_used_as_evidence",
            "reviewed_by",
            "reviewed_at",
        )
        for name in anti_fake_fields:
            assert name in declared
# ===========================================================================
# 6. TestLLMGenerationAudit
# ===========================================================================
class TestLLMGenerationAudit:
    """LLM generation audit trail: model defaults and the /compliance/llm-audit routes."""

    def test_create_audit_record(self):
        """POST /compliance/llm-audit should create a record."""
        record_fields = {
            "id": "audit-001",
            "tenant_id": None,
            "entity_type": "document",
            "entity_id": None,
            "generation_mode": "draft_assistance",
            "truth_status": EvidenceTruthStatusEnum.GENERATED,
            "may_be_used_as_evidence": False,
            "llm_model": "qwen2.5vl:32b",
            "llm_provider": "ollama",
            "prompt_hash": None,
            "input_summary": "Test input",
            "output_summary": "Test output",
            "extra_metadata": {},
            "created_at": NOW,
        }
        mock_record = MagicMock()
        for field, value in record_fields.items():
            setattr(mock_record, field, value)

        mock_db.add = MagicMock()
        mock_db.commit = MagicMock()
        mock_db.refresh = MagicMock(side_effect=lambda r: setattr(r, 'id', 'audit-001'))

        request_body = {
            "entity_type": "document",
            "generation_mode": "draft_assistance",
            "truth_status": "generated",
            "may_be_used_as_evidence": False,
            "llm_model": "qwen2.5vl:32b",
            "llm_provider": "ollama",
        }
        # Patch the model constructor so the route works with the prepared mock record.
        with patch('compliance.api.llm_audit_routes.LLMGenerationAuditDB', return_value=mock_record):
            resp = client.post("/compliance/llm-audit", json=request_body)

        assert resp.status_code == 200
        body = resp.json()
        assert body["entity_type"] == "document"
        assert body["truth_status"] == "generated"
        assert body["may_be_used_as_evidence"] is False

    def test_truth_status_always_generated_for_llm(self):
        """LLM-generated content should always start with truth_status=generated."""
        from compliance.db.models import LLMGenerationAuditDB, EvidenceTruthStatusEnum

        truth_status = LLMGenerationAuditDB().truth_status
        # An unset value (None) is accepted as well as an explicit GENERATED.
        assert truth_status is None or truth_status == EvidenceTruthStatusEnum.GENERATED

    def test_may_be_used_as_evidence_defaults_false(self):
        """Generated content should NOT be usable as evidence by default."""
        from compliance.db.models import LLMGenerationAuditDB

        usable = LLMGenerationAuditDB().may_be_used_as_evidence
        # An unset value (None) is accepted as well as an explicit False.
        assert usable is False or usable is None

    def test_list_audit_records(self):
        """GET /compliance/llm-audit should return records."""
        mock_query = MagicMock()
        mock_query.count.return_value = 0
        # Every chaining call returns the same query object.
        for chained in ("filter", "order_by", "offset", "limit"):
            getattr(mock_query, chained).return_value = mock_query
        mock_query.all.return_value = []
        mock_db.query.return_value = mock_query

        resp = client.get("/compliance/llm-audit")

        assert resp.status_code == 200
        body = resp.json()
        assert "records" in body
        assert "total" in body
        assert body["total"] == 0
# ===========================================================================
# 7. TestEvidenceReview
# ===========================================================================
class TestEvidenceReview:
    """Test evidence review endpoint."""

    def test_review_upgrades_confidence(self):
        """PATCH /evidence/{id}/review should update confidence and set reviewer."""
        starting_state = {
            "confidence_level": EvidenceConfidenceEnum.E1,
            "truth_status": EvidenceTruthStatusEnum.UPLOADED,
        }
        evidence = make_evidence(starting_state)
        mock_db.query.return_value.filter.return_value.first.return_value = evidence
        mock_db.commit = MagicMock()
        mock_db.refresh = MagicMock()

        payload = {
            "confidence_level": "E2",
            "truth_status": "validated_internal",
            "reviewed_by": "auditor@example.com",
        }
        resp = client.patch(f"/evidence/{EVIDENCE_UUID}/review", json=payload)

        assert resp.status_code == 200
        # The route mutates the evidence object it looked up.
        assert evidence.confidence_level == EvidenceConfidenceEnum.E2
        assert evidence.truth_status == EvidenceTruthStatusEnum.VALIDATED_INTERNAL
        assert evidence.reviewed_by == "auditor@example.com"
        assert evidence.reviewed_at is not None

    def test_review_nonexistent_evidence_returns_404(self):
        """A lookup that finds nothing yields 404."""
        mock_db.query.return_value.filter.return_value.first.return_value = None

        payload = {"reviewed_by": "someone"}
        resp = client.patch("/evidence/nonexistent-id/review", json=payload)

        assert resp.status_code == 404

    def test_review_invalid_confidence_returns_400(self):
        """An unknown confidence level is rejected with 400."""
        evidence = make_evidence()
        mock_db.query.return_value.filter.return_value.first.return_value = evidence

        payload = {
            "confidence_level": "INVALID",
            "reviewed_by": "someone",
        }
        resp = client.patch(f"/evidence/{EVIDENCE_UUID}/review", json=payload)

        assert resp.status_code == 400
# ===========================================================================
# 8. TestControlUpdateIntegration
# ===========================================================================
class TestControlUpdateIntegration:
    """Test that ControlUpdate schema includes status_justification."""

    def test_control_update_has_status_justification(self):
        """ControlUpdate declares status_justification."""
        from compliance.api.schemas import ControlUpdate

        assert "status_justification" in ControlUpdate.model_fields

    def test_control_response_has_status_justification(self):
        """ControlResponse declares status_justification."""
        from compliance.api.schemas import ControlResponse

        assert "status_justification" in ControlResponse.model_fields

    def test_control_status_enum_has_in_progress(self):
        """ControlStatusEnum.IN_PROGRESS serialises as 'in_progress'."""
        in_progress = ControlStatusEnum.IN_PROGRESS
        assert in_progress.value == "in_progress"
# ===========================================================================
# 9. TestEvidenceEnums
# ===========================================================================
class TestEvidenceEnums:
    """Test the new evidence enums."""

    def test_confidence_enum_values(self):
        """Each confidence member's value equals its own name (E0..E4)."""
        for level in ("E0", "E1", "E2", "E3", "E4"):
            assert getattr(EvidenceConfidenceEnum, level).value == level

    def test_truth_status_enum_values(self):
        """Each truth-status member serialises to its lower-case name."""
        expected_values = {
            "GENERATED": "generated",
            "UPLOADED": "uploaded",
            "OBSERVED": "observed",
            "VALIDATED_INTERNAL": "validated_internal",
            "REJECTED": "rejected",
            "PROVIDED_TO_AUDITOR": "provided_to_auditor",
            "ACCEPTED_BY_AUDITOR": "accepted_by_auditor",
        }
        for member_name, wire_value in expected_values.items():
            assert getattr(EvidenceTruthStatusEnum, member_name).value == wire_value
@@ -0,0 +1,528 @@
"""Tests for Anti-Fake-Evidence Phase 2.
~35 tests covering:
- Audit trail extension (evidence review/create logging)
- Assertion engine (extraction, CRUD, verify, summary)
- Four-Eyes review (domain check, first/second review, same-person reject)
- UI badge data (response schema includes new fields)
"""
from datetime import datetime, timedelta
from unittest.mock import MagicMock, patch
from fastapi import FastAPI
from fastapi.testclient import TestClient
from compliance.api.evidence_routes import (
router as evidence_router,
_requires_four_eyes,
_classify_confidence,
_classify_truth_status,
)
from compliance.api.assertion_routes import router as assertion_router
from compliance.services.assertion_engine import extract_assertions, _classify_sentence
from compliance.db.models import (
EvidenceConfidenceEnum,
EvidenceTruthStatusEnum,
ControlStatusEnum,
AssertionDB,
)
from classroom_engine.database import get_db
# ---------------------------------------------------------------------------
# App setup with mocked DB dependency
# ---------------------------------------------------------------------------
app = FastAPI()
app.include_router(evidence_router)
app.include_router(assertion_router)
mock_db = MagicMock()
def override_get_db():
    """Generator dependency that yields the module-level ``mock_db``."""
    yield mock_db
app.dependency_overrides[get_db] = override_get_db
client = TestClient(app)
EVIDENCE_UUID = "eeee0002-aaaa-bbbb-cccc-ffffffffffff"
CONTROL_UUID = "cccc0002-aaaa-bbbb-cccc-dddddddddddd"
ASSERTION_UUID = "aaaa0002-bbbb-cccc-dddd-eeeeeeeeeeee"
NOW = datetime(2026, 3, 23, 14, 0, 0)
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def make_evidence(overrides=None):
    """Return a MagicMock evidence record with default attribute values.

    Args:
        overrides: optional mapping of attribute name to value; entries
            replace the defaults or add further attributes.
    """
    status = MagicMock()
    status.value = "valid"

    attributes = {
        "id": EVIDENCE_UUID,
        "control_id": CONTROL_UUID,
        "evidence_type": "test_results",
        "title": "Phase 2 Test Evidence",
        "description": "Testing four-eyes",
        "artifact_url": "https://ci.example.com/artifact",
        "artifact_path": None,
        "artifact_hash": "abc123",
        "file_size_bytes": None,
        "mime_type": None,
        "status": status,
        "uploaded_by": None,
        "source": "api",
        "ci_job_id": None,
        "valid_from": NOW,
        "valid_until": NOW + timedelta(days=90),
        "collected_at": NOW,
        "created_at": NOW,
        "confidence_level": EvidenceConfidenceEnum.E1,
        "truth_status": EvidenceTruthStatusEnum.UPLOADED,
        "generation_mode": None,
        "may_be_used_as_evidence": True,
        "reviewed_by": None,
        "reviewed_at": None,
        # Phase 2 fields
        "approval_status": "none",
        "first_reviewer": None,
        "first_reviewed_at": None,
        "second_reviewer": None,
        "second_reviewed_at": None,
        "requires_four_eyes": False,
    }
    attributes.update(overrides or {})

    e = MagicMock()
    for key, value in attributes.items():
        setattr(e, key, value)
    return e
def make_assertion(overrides=None):
    """Return a MagicMock assertion record with default attribute values.

    Args:
        overrides: optional mapping of attribute name to value; entries
            replace the defaults or add further attributes.
    """
    attributes = {
        "id": ASSERTION_UUID,
        "tenant_id": "tenant-001",
        "entity_type": "control",
        "entity_id": CONTROL_UUID,
        "sentence_text": "Test assertion sentence",
        "sentence_index": 0,
        "assertion_type": "assertion",
        "evidence_ids": [],
        "confidence": 0.0,
        "normative_tier": "pflicht",
        "verified_by": None,
        "verified_at": None,
        "created_at": NOW,
        "updated_at": NOW,
    }
    attributes.update(overrides or {})

    a = MagicMock()
    for key, value in attributes.items():
        setattr(a, key, value)
    return a
# ===========================================================================
# 1. TestAuditTrailExtension
# ===========================================================================
class TestAuditTrailExtension:
    """Test that evidence review and create log audit trail entries."""

    @staticmethod
    def _prime_lookup(found, stub_refresh=True):
        """Reset the shared session mock and make the lookup chain return ``found``."""
        mock_db.reset_mock()
        mock_db.query.return_value.filter.return_value.first.return_value = found
        if stub_refresh:
            mock_db.refresh.return_value = None

    def test_review_evidence_logs_audit_trail(self):
        """A successful review calls db.add at least once."""
        self._prime_lookup(make_evidence())

        payload = {"confidence_level": "E2", "reviewed_by": "auditor@test.com"}
        resp = client.patch(f"/evidence/{EVIDENCE_UUID}/review", json=payload)

        assert resp.status_code == 200
        # db.add should be called for audit trail entries
        assert mock_db.add.called

    def test_review_evidence_records_old_and_new_confidence(self):
        """Reviewing with a new confidence level succeeds."""
        self._prime_lookup(make_evidence({"confidence_level": EvidenceConfidenceEnum.E1}))

        payload = {"confidence_level": "E3", "reviewed_by": "reviewer@test.com"}
        resp = client.patch(f"/evidence/{EVIDENCE_UUID}/review", json=payload)

        assert resp.status_code == 200

    def test_review_evidence_records_truth_status_change(self):
        """Reviewing with a new truth status succeeds."""
        self._prime_lookup(make_evidence({"truth_status": EvidenceTruthStatusEnum.UPLOADED}))

        payload = {"truth_status": "validated_internal", "reviewed_by": "reviewer@test.com"}
        resp = client.patch(f"/evidence/{EVIDENCE_UUID}/review", json=payload)

        assert resp.status_code == 200

    def test_review_nonexistent_evidence_returns_404(self):
        """Reviewing evidence that the lookup does not find yields 404."""
        self._prime_lookup(None, stub_refresh=False)

        resp = client.patch("/evidence/nonexistent/review", json={"reviewed_by": "someone"})

        assert resp.status_code == 404

    def test_reject_evidence_logs_audit_trail(self):
        """Rejecting evidence reports approval_status 'rejected'."""
        self._prime_lookup(make_evidence())

        payload = {"reviewed_by": "auditor@test.com", "rejection_reason": "Fake evidence"}
        resp = client.patch(f"/evidence/{EVIDENCE_UUID}/reject", json=payload)

        assert resp.status_code == 200
        body = resp.json()
        assert body["approval_status"] == "rejected"

    def test_reject_nonexistent_evidence_returns_404(self):
        """Rejecting evidence that the lookup does not find yields 404."""
        self._prime_lookup(None, stub_refresh=False)

        resp = client.patch("/evidence/nonexistent/reject", json={"reviewed_by": "someone"})

        assert resp.status_code == 404

    def test_audit_trail_query_endpoint(self):
        """GET /audit-trail returns at least the one stubbed entry."""
        mock_db.reset_mock()
        trail_entry = MagicMock(
            id="trail-001",
            entity_type="evidence",
            entity_id=EVIDENCE_UUID,
            entity_name="Test",
            action="review",
            field_changed="confidence_level",
            old_value="E1",
            new_value="E2",
            change_summary=None,
            performed_by="auditor",
            performed_at=NOW,
            checksum="abc",
        )
        # Stub query().filter().filter().order_by().limit().all()
        chain = mock_db.query.return_value
        chain = chain.filter.return_value.filter.return_value
        chain = chain.order_by.return_value.limit.return_value
        chain.all.return_value = [trail_entry]

        resp = client.get(f"/audit-trail?entity_type=evidence&entity_id={EVIDENCE_UUID}")

        assert resp.status_code == 200
        body = resp.json()
        assert body["total"] >= 1

    def test_audit_trail_checksum_present(self):
        """Audit trail entries should have a checksum for integrity."""
        from compliance.api.audit_trail_utils import create_signature

        signature = create_signature("evidence|123|review|user@test.com")
        assert len(signature) == 64  # SHA-256 hex digest
# ===========================================================================
# 2. TestAssertionEngine
# ===========================================================================
class TestAssertionEngine:
    """Test assertion extraction and classification."""

    def test_pflicht_sentence_classified_as_assertion(self):
        """A 'muss' sentence is an assertion in the 'pflicht' tier."""
        result = _classify_sentence("Die Organisation muss ein ISMS implementieren.")
        assert result == ("assertion", "pflicht")

    def test_empfehlung_sentence_classified(self):
        """A 'sollte' sentence is an assertion in the 'empfehlung' tier."""
        result = _classify_sentence("Die Organisation sollte regelmäßige Audits durchführen.")
        assert result == ("assertion", "empfehlung")

    def test_kann_sentence_classified(self):
        """A 'kann' sentence is an assertion in the 'kann' tier."""
        result = _classify_sentence("Optional kann ein externes Audit durchgeführt werden.")
        assert result == ("assertion", "kann")

    def test_rationale_sentence_classified(self):
        """A 'weil' sentence is classified as rationale with no tier."""
        result = _classify_sentence("Dies ist erforderlich, weil Datenverlust schwere Folgen hat.")
        assert result == ("rationale", None)

    def test_fact_sentence_with_evidence_keyword(self):
        """A sentence about an issued certificate is classified as a fact."""
        result = _classify_sentence("Das Zertifikat wurde am 15.03.2026 ausgestellt.")
        assert result == ("fact", None)

    def test_extract_assertions_splits_sentences(self):
        """Two sentences yield two results with their own tiers."""
        text = "Die Organisation muss Daten schützen. Sie sollte regelmäßig prüfen."
        results = extract_assertions(text, "control", "ctrl-001")
        assert len(results) == 2
        assert results[0]["assertion_type"] == "assertion"
        assert results[0]["normative_tier"] == "pflicht"
        assert results[1]["normative_tier"] == "empfehlung"

    def test_extract_assertions_empty_text(self):
        """Empty input yields an empty list."""
        results = extract_assertions("", "control", "ctrl-001")
        assert results == []

    def test_extract_assertions_single_sentence(self):
        """A single sentence yields exactly one result."""
        results = extract_assertions("Der Betreiber muss ein Audit durchführen.", "control", "ctrl-001")
        assert len(results) == 1
        assert results[0]["normative_tier"] == "pflicht"

    def test_mixed_text_with_rationale(self):
        """Obligation plus justification yields one assertion and one rationale."""
        text = "Die Organisation muss ein ISMS implementieren. Dies ist notwendig, weil Compliance gefordert ist."
        results = extract_assertions(text, "control", "ctrl-001")
        assert len(results) == 2
        types = [r["assertion_type"] for r in results]
        assert "assertion" in types
        assert "rationale" in types

    def test_assertion_crud_create(self):
        """POST /assertions succeeds when db.add fills in the generated columns."""
        mock_db.reset_mock()
        mock_db.refresh.return_value = None

        # Mock the added object to return proper values
        def side_effect_add(obj):
            obj.id = ASSERTION_UUID
            obj.created_at = NOW
            obj.updated_at = NOW
            obj.sentence_index = 0
            obj.confidence = 0.0

        mock_db.add.side_effect = side_effect_add
        try:
            resp = client.post(
                "/assertions?tenant_id=tenant-001",
                json={
                    "entity_type": "control",
                    "entity_id": CONTROL_UUID,
                    "sentence_text": "Die Organisation muss ein ISMS implementieren.",
                    "assertion_type": "assertion",
                    "normative_tier": "pflicht",
                },
            )
        finally:
            # reset_mock() keeps side_effect by default, so without this the
            # hook would keep rewriting every object passed to db.add in
            # later tests that share mock_db.
            mock_db.add.side_effect = None
        assert resp.status_code == 200

    def test_assertion_verify_endpoint(self):
        """Verifying an assertion turns it into a fact and records the verifier."""
        a = make_assertion()
        mock_db.reset_mock()
        mock_db.query.return_value.filter.return_value.first.return_value = a
        mock_db.refresh.return_value = None
        resp = client.post(f"/assertions/{ASSERTION_UUID}/verify?verified_by=auditor@test.com")
        assert resp.status_code == 200
        assert a.assertion_type == "fact"
        assert a.verified_by == "auditor@test.com"

    def test_assertion_summary(self):
        """GET /assertions/summary reports totals per type and the unverified count."""
        mock_db.reset_mock()
        a1 = make_assertion({"assertion_type": "assertion", "verified_by": None})
        a2 = make_assertion({"assertion_type": "fact", "verified_by": "user"})
        a3 = make_assertion({"assertion_type": "rationale", "verified_by": None})
        mock_db.query.return_value.filter.return_value.filter.return_value.filter.return_value.all.return_value = [a1, a2, a3]
        # Direct .all() for no-filter case
        mock_db.query.return_value.all.return_value = [a1, a2, a3]
        resp = client.get("/assertions/summary")
        assert resp.status_code == 200
        data = resp.json()
        assert data["total_assertions"] == 3
        assert data["total_facts"] == 1
        assert data["total_rationale"] == 1
        assert data["unverified_count"] == 1
# ===========================================================================
# 3. TestFourEyesReview
# ===========================================================================
class TestFourEyesReview:
    """Test Four-Eyes review process."""

    @staticmethod
    def _prime_lookup(evidence, stub_refresh=True):
        """Reset the shared session mock and make the lookup chain return ``evidence``."""
        mock_db.reset_mock()
        mock_db.query.return_value.filter.return_value.first.return_value = evidence
        if stub_refresh:
            mock_db.refresh.return_value = None

    @staticmethod
    def _review_as(reviewer):
        """PATCH the review endpoint with only ``reviewed_by`` set."""
        return client.patch(
            f"/evidence/{EVIDENCE_UUID}/review",
            json={"reviewed_by": reviewer},
        )

    def test_gov_domain_requires_four_eyes(self):
        """The 'gov' domain requires four-eyes review."""
        assert _requires_four_eyes("gov") is True

    def test_priv_domain_requires_four_eyes(self):
        """The 'priv' domain requires four-eyes review."""
        assert _requires_four_eyes("priv") is True

    def test_ops_domain_does_not_require_four_eyes(self):
        """The 'ops' domain does not require four-eyes review."""
        assert _requires_four_eyes("ops") is False

    def test_sdlc_domain_does_not_require_four_eyes(self):
        """The 'sdlc' domain does not require four-eyes review."""
        assert _requires_four_eyes("sdlc") is False

    def test_first_review_sets_first_approved(self):
        """The first review records the reviewer and moves to 'first_approved'."""
        evidence = make_evidence({
            "requires_four_eyes": True,
            "approval_status": "pending_first",
        })
        self._prime_lookup(evidence)

        resp = self._review_as("reviewer1@test.com")

        assert resp.status_code == 200
        assert evidence.first_reviewer == "reviewer1@test.com"
        assert evidence.approval_status == "first_approved"

    def test_second_review_different_person_approves(self):
        """A second review by a different person moves to 'approved'."""
        evidence = make_evidence({
            "requires_four_eyes": True,
            "approval_status": "first_approved",
            "first_reviewer": "reviewer1@test.com",
        })
        self._prime_lookup(evidence)

        resp = self._review_as("reviewer2@test.com")

        assert resp.status_code == 200
        assert evidence.second_reviewer == "reviewer2@test.com"
        assert evidence.approval_status == "approved"

    def test_same_person_second_review_rejected(self):
        """The first reviewer cannot also perform the second review."""
        evidence = make_evidence({
            "requires_four_eyes": True,
            "approval_status": "first_approved",
            "first_reviewer": "reviewer1@test.com",
        })
        self._prime_lookup(evidence, stub_refresh=False)

        resp = self._review_as("reviewer1@test.com")

        assert resp.status_code == 400
        assert "different" in resp.json()["detail"].lower()

    def test_already_approved_blocked(self):
        """Evidence that is already approved cannot be reviewed again."""
        evidence = make_evidence({
            "requires_four_eyes": True,
            "approval_status": "approved",
        })
        self._prime_lookup(evidence, stub_refresh=False)

        resp = self._review_as("reviewer3@test.com")

        assert resp.status_code == 400
        assert "already" in resp.json()["detail"].lower()

    def test_rejected_evidence_cannot_be_reviewed(self):
        """Rejected evidence cannot be reviewed."""
        evidence = make_evidence({
            "requires_four_eyes": True,
            "approval_status": "rejected",
        })
        self._prime_lookup(evidence, stub_refresh=False)

        resp = self._review_as("reviewer@test.com")

        assert resp.status_code == 400

    def test_reject_endpoint(self):
        """PATCH /evidence/{id}/reject sets approval_status to 'rejected'."""
        evidence = make_evidence({"requires_four_eyes": True})
        self._prime_lookup(evidence)

        resp = client.patch(
            f"/evidence/{EVIDENCE_UUID}/reject",
            json={"reviewed_by": "auditor@test.com", "rejection_reason": "Not authentic"},
        )

        assert resp.status_code == 200
        assert evidence.approval_status == "rejected"
# ===========================================================================
# 4. TestUIBadgeData
# ===========================================================================
class TestUIBadgeData:
    """Test that evidence response includes all Phase 2 fields."""

    def test_evidence_response_includes_approval_status(self):
        """The review response body carries approval_status and requires_four_eyes."""
        half_reviewed = {
            "approval_status": "first_approved",
            "first_reviewer": "reviewer1@test.com",
            "first_reviewed_at": NOW,
            "requires_four_eyes": True,
        }
        evidence = make_evidence(half_reviewed)
        mock_db.reset_mock()
        mock_db.query.return_value.filter.return_value.first.return_value = evidence
        mock_db.refresh.return_value = None

        resp = client.patch(
            f"/evidence/{EVIDENCE_UUID}/review",
            json={"reviewed_by": "reviewer2@test.com"},
        )

        assert resp.status_code == 200
        body = resp.json()
        assert "approval_status" in body
        assert "requires_four_eyes" in body
        assert body["requires_four_eyes"] is True

    def test_evidence_response_includes_four_eyes_fields(self):
        """_build_evidence_response copies both reviewers and the approval state."""
        fully_reviewed = {
            "requires_four_eyes": True,
            "approval_status": "approved",
            "first_reviewer": "r1@test.com",
            "first_reviewed_at": NOW,
            "second_reviewer": "r2@test.com",
            "second_reviewed_at": NOW,
        }
        evidence = make_evidence(fully_reviewed)
        mock_db.reset_mock()
        query_result = mock_db.query.return_value
        query_result.filter.return_value.first.return_value = evidence
        # Use list endpoint
        query_result.filter.return_value.all.return_value = [evidence]
        query_result.all.return_value = [evidence]

        # Direct test via _build_evidence_response
        from compliance.api.evidence_routes import _build_evidence_response

        built = _build_evidence_response(evidence)
        assert built.approval_status == "approved"
        assert built.first_reviewer == "r1@test.com"
        assert built.second_reviewer == "r2@test.com"
        assert built.requires_four_eyes is True

    def test_assertion_response_schema(self):
        """GET /assertions/{id} exposes type, tier, evidence ids and verifier."""
        a = make_assertion()
        mock_db.reset_mock()
        mock_db.query.return_value.filter.return_value.first.return_value = a

        resp = client.get(f"/assertions/{ASSERTION_UUID}")

        assert resp.status_code == 200
        body = resp.json()
        for key in ("assertion_type", "normative_tier", "evidence_ids", "verified_by"):
            assert key in body

    def test_evidence_response_includes_confidence_and_truth(self):
        """Enum-valued fields are rendered as their string values."""
        evidence = make_evidence({
            "confidence_level": EvidenceConfidenceEnum.E3,
            "truth_status": EvidenceTruthStatusEnum.OBSERVED,
        })
        from compliance.api.evidence_routes import _build_evidence_response

        built = _build_evidence_response(evidence)
        assert built.confidence_level == "E3"
        assert built.truth_status == "observed"

    def test_evidence_response_none_four_eyes_fields_default(self):
        """Default evidence reports no approval workflow and no first reviewer."""
        evidence = make_evidence()
        from compliance.api.evidence_routes import _build_evidence_response

        built = _build_evidence_response(evidence)
        assert built.approval_status == "none"
        assert built.requires_four_eyes is False
        assert built.first_reviewer is None
@@ -0,0 +1,191 @@
"""Tests for Anti-Fake-Evidence Phase 3: Enforcement.
~8 tests covering:
- Evidence distribution endpoint (confidence counts, four-eyes pending)
- Dashboard multi-score presence
"""
from datetime import datetime, timedelta
from unittest.mock import MagicMock, patch, PropertyMock
from fastapi import FastAPI
from fastapi.testclient import TestClient
from compliance.api.dashboard_routes import router as dashboard_router
from compliance.db.models import EvidenceConfidenceEnum, EvidenceTruthStatusEnum
from classroom_engine.database import get_db
# ---------------------------------------------------------------------------
# App setup with mocked DB dependency
# ---------------------------------------------------------------------------
app = FastAPI()
app.include_router(dashboard_router)
mock_db = MagicMock()
def override_get_db():
    """Generator dependency that yields the module-level ``mock_db``."""
    yield mock_db
app.dependency_overrides[get_db] = override_get_db
client = TestClient(app)
NOW = datetime(2026, 3, 23, 14, 0, 0)
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def make_evidence(confidence="E1", requires_four_eyes=False, approval_status="none"):
    """Return a MagicMock evidence row for the distribution tests.

    ``confidence`` is exposed through ``confidence_level.value``; the other
    two arguments are set as plain attributes.
    """
    row = MagicMock()
    row.confidence_level = MagicMock(value=confidence)
    row.requires_four_eyes = requires_four_eyes
    row.approval_status = approval_status
    return row
# ===========================================================================
# 1. TestEvidenceDistributionEndpoint
# ===========================================================================
class TestEvidenceDistributionEndpoint:
    """Test GET /dashboard/evidence-distribution endpoint."""

    def _setup_evidence(self, evidence_list):
        """Reset the shared session mock and hand ``evidence_list`` back unchanged.

        The tests below patch EvidenceRepository directly instead of stubbing
        the db.query(...).all() call it would make.
        """
        mock_db.reset_mock()
        return evidence_list

    @staticmethod
    def _fetch_distribution(mock_repo_cls, rows):
        """Make the patched repository return ``rows`` and GET the endpoint as JSON."""
        repo = MagicMock()
        repo.get_all.return_value = rows
        mock_repo_cls.return_value = repo
        resp = client.get("/dashboard/evidence-distribution")
        assert resp.status_code == 200
        return resp.json()

    @patch("compliance.api.dashboard_routes.EvidenceRepository")
    def test_empty_db_returns_zero_counts(self, mock_repo_cls):
        """No evidence yields zero totals and a zeroed bucket per level."""
        data = self._fetch_distribution(mock_repo_cls, [])
        assert data["total"] == 0
        assert data["four_eyes_pending"] == 0
        assert data["by_confidence"] == {"E0": 0, "E1": 0, "E2": 0, "E3": 0, "E4": 0}

    @patch("compliance.api.dashboard_routes.EvidenceRepository")
    def test_counts_by_confidence_level(self, mock_repo_cls):
        """Each row is counted in the bucket of its confidence level."""
        levels = ("E0", "E1", "E1", "E2", "E3", "E3", "E3", "E4")
        evidence = [make_evidence(level) for level in levels]

        data = self._fetch_distribution(mock_repo_cls, evidence)

        assert data["total"] == 8
        expected_counts = {"E0": 1, "E1": 2, "E2": 1, "E3": 3, "E4": 1}
        for level, count in expected_counts.items():
            assert data["by_confidence"][level] == count

    @patch("compliance.api.dashboard_routes.EvidenceRepository")
    def test_four_eyes_pending_count(self, mock_repo_cls):
        """Only pending_first and first_approved rows count as pending."""
        evidence = [
            make_evidence("E1", requires_four_eyes=True, approval_status="pending_first"),
            make_evidence("E2", requires_four_eyes=True, approval_status="first_approved"),
            make_evidence("E2", requires_four_eyes=True, approval_status="approved"),
            make_evidence("E1", requires_four_eyes=True, approval_status="rejected"),
            make_evidence("E1", requires_four_eyes=False, approval_status="none"),
        ]

        data = self._fetch_distribution(mock_repo_cls, evidence)

        # pending_first and first_approved are pending; approved and rejected are not
        assert data["four_eyes_pending"] == 2
        assert data["total"] == 5

    @patch("compliance.api.dashboard_routes.EvidenceRepository")
    def test_null_confidence_defaults_to_e1(self, mock_repo_cls):
        """A row whose confidence_level is None is counted under E1."""
        row = MagicMock()
        row.confidence_level = None
        row.requires_four_eyes = False
        row.approval_status = "none"

        data = self._fetch_distribution(mock_repo_cls, [row])

        assert data["by_confidence"]["E1"] == 1
        assert data["total"] == 1

    @patch("compliance.api.dashboard_routes.EvidenceRepository")
    def test_all_four_eyes_approved_zero_pending(self, mock_repo_cls):
        """Fully approved four-eyes rows leave nothing pending."""
        evidence = [
            make_evidence("E2", requires_four_eyes=True, approval_status="approved"),
            make_evidence("E3", requires_four_eyes=True, approval_status="approved"),
        ]

        data = self._fetch_distribution(mock_repo_cls, evidence)

        assert data["four_eyes_pending"] == 0
# ===========================================================================
# 2. TestDashboardMultiScore
# ===========================================================================
class TestDashboardMultiScore:
    """Test that dashboard response includes multi_score."""

    def test_dashboard_response_schema_includes_multi_score(self):
        """DashboardResponse schema must include the multi_score field."""
        from compliance.api.schemas import DashboardResponse

        declared = DashboardResponse.model_fields
        assert "multi_score" in declared, "DashboardResponse must have multi_score field"

    def test_multi_score_schema_has_required_fields(self):
        """MultiDimensionalScore schema should have all 7 fields."""
        from compliance.api.schemas import MultiDimensionalScore

        declared = MultiDimensionalScore.model_fields
        required = (
            "requirement_coverage",
            "evidence_strength",
            "validation_quality",
            "evidence_freshness",
            "control_effectiveness",
            "overall_readiness",
            "hard_blocks",
        )
        for field in required:
            assert field in declared, f"Missing field: {field}"

    def test_multi_score_default_values(self):
        """MultiDimensionalScore defaults should be sensible."""
        from compliance.api.schemas import MultiDimensionalScore

        blank = MultiDimensionalScore()
        assert blank.overall_readiness == 0.0
        assert blank.hard_blocks == []
        assert blank.requirement_coverage == 0.0
assert score.requirement_coverage == 0.0
@@ -0,0 +1,277 @@
"""Tests for Anti-Fake-Evidence Phase 4a: Traceability Matrix.
6 tests covering:
- Empty DB returns empty controls + zero summary
- Nested structure: Control → Evidence → Assertions
- Assertions appear under correct evidence
- Coverage flags computed correctly
- Control without evidence has correct coverage
- Summary counts match
"""
from datetime import datetime
from unittest.mock import MagicMock, patch
from fastapi import FastAPI
from fastapi.testclient import TestClient
from compliance.api.dashboard_routes import router as dashboard_router
from classroom_engine.database import get_db
# ---------------------------------------------------------------------------
# App setup with mocked DB dependency
# ---------------------------------------------------------------------------
app = FastAPI()
app.include_router(dashboard_router)
mock_db = MagicMock()
def override_get_db():
    """Generator dependency that yields the module-level ``mock_db``."""
    yield mock_db
app.dependency_overrides[get_db] = override_get_db
client = TestClient(app)
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def make_control(id="c1", control_id="CTRL-001", title="Test Control", status="pass", domain="gov"):
    """Return a MagicMock control row.

    ``status`` and ``domain`` are exposed through a ``.value`` attribute;
    the remaining arguments are set as plain attributes.
    """
    control = MagicMock()
    control.id = id
    control.control_id = control_id
    control.title = title
    control.status = MagicMock(value=status)
    control.domain = MagicMock(value=domain)
    return control
def make_evidence(id="e1", control_id="c1", title="Evidence 1", evidence_type="scan_report",
                  confidence="E2", status="valid"):
    """Return a MagicMock evidence row linked to ``control_id``.

    ``confidence`` and ``status`` are exposed through ``confidence_level.value``
    and ``status.value``; the remaining arguments are plain attributes.
    """
    row = MagicMock()
    row.id = id
    row.control_id = control_id
    row.title = title
    row.evidence_type = evidence_type
    row.confidence_level = MagicMock(value=confidence)
    row.status = MagicMock(value=status)
    return row
def make_assertion(id="a1", entity_id="e1", sentence_text="System encrypts data at rest.",
                   assertion_type="assertion", confidence=0.85, verified_by=None):
    """Return a MagicMock assertion row whose attributes mirror the arguments."""
    row = MagicMock()
    fields = {
        "id": id,
        "entity_id": entity_id,
        "sentence_text": sentence_text,
        "assertion_type": assertion_type,
        "confidence": confidence,
        "verified_by": verified_by,
    }
    for attr, value in fields.items():
        setattr(row, attr, value)
    return row
# ===========================================================================
# Tests
# ===========================================================================
class TestTraceabilityMatrix:
"""Test GET /dashboard/traceability-matrix endpoint."""
@patch("compliance.api.dashboard_routes.EvidenceRepository")
@patch("compliance.api.dashboard_routes.ControlRepository")
def test_empty_db_returns_empty_matrix(self, mock_ctrl_cls, mock_ev_cls):
    """Empty DB should return zero controls and zero summary counts."""
    # Both patched repositories report no rows.
    mock_ctrl_cls.return_value.get_all.return_value = []
    mock_ev_cls.return_value.get_all.return_value = []

    # Mock db.query(AssertionDB).filter(...).all()
    mock_db.reset_mock()
    mock_db.query.return_value = MagicMock(
        **{"filter.return_value.all.return_value": []}
    )

    resp = client.get("/dashboard/traceability-matrix")

    assert resp.status_code == 200
    data = resp.json()
    assert data["controls"] == []
    summary = data["summary"]
    for key in ("total_controls", "covered_controls", "fully_verified", "uncovered_controls"):
        assert summary[key] == 0
@patch("compliance.api.dashboard_routes.EvidenceRepository")
@patch("compliance.api.dashboard_routes.ControlRepository")
def test_nested_structure(self, mock_ctrl_cls, mock_ev_cls):
    """Control with evidence and assertions should return nested structure."""
    ctrl = make_control(id="c1", control_id="PRIV-001", title="Privacy Control")
    ev = make_evidence(id="e1", control_id="c1", confidence="E3")
    assertion = make_assertion(id="a1", entity_id="e1", verified_by="auditor@example.com")

    # One control, one evidence row, one assertion on that evidence.
    mock_ctrl_cls.return_value.get_all.return_value = [ctrl]
    mock_ev_cls.return_value.get_all.return_value = [ev]
    mock_db.reset_mock()
    mock_db.query.return_value = MagicMock(
        **{"filter.return_value.all.return_value": [assertion]}
    )

    resp = client.get("/dashboard/traceability-matrix")

    assert resp.status_code == 200
    data = resp.json()
    assert len(data["controls"]) == 1
    control_node = data["controls"][0]
    assert control_node["control_id"] == "PRIV-001"
    assert len(control_node["evidence"]) == 1
    evidence_node = control_node["evidence"][0]
    assert evidence_node["confidence_level"] == "E3"
    assert len(evidence_node["assertions"]) == 1
    assert evidence_node["assertions"][0]["verified"] is True
@patch("compliance.api.dashboard_routes.EvidenceRepository")
@patch("compliance.api.dashboard_routes.ControlRepository")
def test_assertions_grouped_under_correct_evidence(self, mock_ctrl_cls, mock_ev_cls):
"""Assertions should only appear under the evidence they reference."""
ctrl = make_control(id="c1")
ev1 = make_evidence(id="e1", control_id="c1", title="Evidence A")
ev2 = make_evidence(id="e2", control_id="c1", title="Evidence B")
a1 = make_assertion(id="a1", entity_id="e1", sentence_text="Assertion for E1")
a2 = make_assertion(id="a2", entity_id="e2", sentence_text="Assertion for E2")
a3 = make_assertion(id="a3", entity_id="e2", sentence_text="Second assertion for E2")
mock_ctrl = MagicMock()
mock_ctrl.get_all.return_value = [ctrl]
mock_ctrl_cls.return_value = mock_ctrl
mock_ev = MagicMock()
mock_ev.get_all.return_value = [ev1, ev2]
mock_ev_cls.return_value = mock_ev
mock_db.reset_mock()
mock_query = MagicMock()
mock_query.filter.return_value.all.return_value = [a1, a2, a3]
mock_db.query.return_value = mock_query
resp = client.get("/dashboard/traceability-matrix")
assert resp.status_code == 200
data = resp.json()
c = data["controls"][0]
ev1_data = next(e for e in c["evidence"] if e["id"] == "e1")
ev2_data = next(e for e in c["evidence"] if e["id"] == "e2")
assert len(ev1_data["assertions"]) == 1
assert len(ev2_data["assertions"]) == 2
@patch("compliance.api.dashboard_routes.EvidenceRepository")
@patch("compliance.api.dashboard_routes.ControlRepository")
def test_coverage_flags_correct(self, mock_ctrl_cls, mock_ev_cls):
"""Coverage flags should reflect evidence, assertions, and verification state."""
ctrl = make_control(id="c1")
ev = make_evidence(id="e1", control_id="c1", confidence="E2")
# One verified, one not
a1 = make_assertion(id="a1", entity_id="e1", verified_by="alice")
a2 = make_assertion(id="a2", entity_id="e1", verified_by=None)
mock_ctrl = MagicMock()
mock_ctrl.get_all.return_value = [ctrl]
mock_ctrl_cls.return_value = mock_ctrl
mock_ev = MagicMock()
mock_ev.get_all.return_value = [ev]
mock_ev_cls.return_value = mock_ev
mock_db.reset_mock()
mock_query = MagicMock()
mock_query.filter.return_value.all.return_value = [a1, a2]
mock_db.query.return_value = mock_query
resp = client.get("/dashboard/traceability-matrix")
assert resp.status_code == 200
cov = resp.json()["controls"][0]["coverage"]
assert cov["has_evidence"] is True
assert cov["has_assertions"] is True
assert cov["all_assertions_verified"] is False # a2 not verified
assert cov["min_confidence_level"] == "E2"
@patch("compliance.api.dashboard_routes.EvidenceRepository")
@patch("compliance.api.dashboard_routes.ControlRepository")
def test_coverage_without_evidence(self, mock_ctrl_cls, mock_ev_cls):
"""Control with no evidence should have all coverage flags False/None."""
ctrl = make_control(id="c1")
mock_ctrl = MagicMock()
mock_ctrl.get_all.return_value = [ctrl]
mock_ctrl_cls.return_value = mock_ctrl
mock_ev = MagicMock()
mock_ev.get_all.return_value = []
mock_ev_cls.return_value = mock_ev
mock_db.reset_mock()
mock_query = MagicMock()
mock_query.filter.return_value.all.return_value = []
mock_db.query.return_value = mock_query
resp = client.get("/dashboard/traceability-matrix")
assert resp.status_code == 200
cov = resp.json()["controls"][0]["coverage"]
assert cov["has_evidence"] is False
assert cov["has_assertions"] is False
assert cov["all_assertions_verified"] is False
assert cov["min_confidence_level"] is None
@patch("compliance.api.dashboard_routes.EvidenceRepository")
@patch("compliance.api.dashboard_routes.ControlRepository")
def test_summary_counts(self, mock_ctrl_cls, mock_ev_cls):
"""Summary should count total, covered, fully verified, and uncovered controls."""
# c1: has evidence + verified assertions → fully verified
# c2: has evidence but no assertions → covered, not fully verified
# c3: no evidence → uncovered
c1 = make_control(id="c1", control_id="C-001")
c2 = make_control(id="c2", control_id="C-002")
c3 = make_control(id="c3", control_id="C-003")
ev1 = make_evidence(id="e1", control_id="c1", confidence="E3")
ev2 = make_evidence(id="e2", control_id="c2", confidence="E1")
a1 = make_assertion(id="a1", entity_id="e1", verified_by="auditor")
mock_ctrl = MagicMock()
mock_ctrl.get_all.return_value = [c1, c2, c3]
mock_ctrl_cls.return_value = mock_ctrl
mock_ev = MagicMock()
mock_ev.get_all.return_value = [ev1, ev2]
mock_ev_cls.return_value = mock_ev
mock_db.reset_mock()
mock_query = MagicMock()
mock_query.filter.return_value.all.return_value = [a1]
mock_db.query.return_value = mock_query
resp = client.get("/dashboard/traceability-matrix")
assert resp.status_code == 200
summary = resp.json()["summary"]
assert summary["total_controls"] == 3
assert summary["covered_controls"] == 2
assert summary["fully_verified"] == 1
assert summary["uncovered_controls"] == 1
@@ -0,0 +1,440 @@
"""Tests for Batch Dedup Runner (batch_dedup_runner.py).
Covers:
- quality_score(): Richness ranking
- BatchDedupRunner._sub_group_by_merge_hint(): Composite key grouping
- Master selection (highest quality score wins)
- Duplicate linking (mark + parent-link transfer)
- Dry run mode (no DB changes)
- Cross-group pass
- Progress reporting / stats
"""
import json
import pytest
from unittest.mock import MagicMock, AsyncMock, patch, call
from compliance.services.batch_dedup_runner import (
quality_score,
BatchDedupRunner,
DEDUP_COLLECTION,
)
# ---------------------------------------------------------------------------
# quality_score TESTS
# ---------------------------------------------------------------------------
class TestQualityScore:
    """quality_score(): a control with more content must score higher."""

    def test_empty_control(self):
        assert quality_score({}) == 0.0

    def test_requirements_weight(self):
        control = {"requirements": json.dumps(["r1", "r2", "r3"])}
        # three requirements at 2.0 each
        assert quality_score(control) == pytest.approx(6.0)

    def test_test_procedure_weight(self):
        control = {"test_procedure": json.dumps(["t1", "t2"])}
        # two test steps at 1.5 each
        assert quality_score(control) == pytest.approx(3.0)

    def test_evidence_weight(self):
        control = {"evidence": json.dumps(["e1"])}
        # one evidence item at 1.0
        assert quality_score(control) == pytest.approx(1.0)

    def test_objective_weight_capped(self):
        below_cap = quality_score({"objective": "x" * 100})
        above_cap = quality_score({"objective": "x" * 1000})
        assert below_cap == pytest.approx(0.5)  # 100/200
        assert above_cap == pytest.approx(3.0)  # capped at 3.0

    def test_combined_score(self):
        # 2*2 + 1*1.5 + 2*1.0 + min(400/200, 3) = 4 + 1.5 + 2 + 2 = 9.5
        combined = quality_score({
            "requirements": json.dumps(["r1", "r2"]),
            "test_procedure": json.dumps(["t1"]),
            "evidence": json.dumps(["e1", "e2"]),
            "objective": "x" * 400,
        })
        assert combined == pytest.approx(9.5)

    def test_json_string_vs_list(self):
        """A json.dumps() string and a hand-written JSON literal score the same.

        NOTE(review): despite the test name, both inputs are JSON *strings*;
        an already-parsed list is never passed to quality_score() here.
        """
        from_dumps = quality_score({"requirements": json.dumps(["r1", "r2"])})
        from_literal = quality_score({"requirements": '["r1", "r2"]'})
        assert from_dumps == from_literal

    def test_null_fields(self):
        """Fields explicitly set to None must not raise and contribute nothing."""
        all_null = dict.fromkeys(
            ("requirements", "test_procedure", "evidence", "objective")
        )
        assert quality_score(all_null) == 0.0

    def test_ranking_order(self):
        """A richly populated control outranks a sparse one."""
        sparse = {
            "requirements": json.dumps(["r1"]),
            "objective": "Short",
        }
        rich = {
            "requirements": json.dumps(["r1", "r2", "r3"]),
            "test_procedure": json.dumps(["t1", "t2"]),
            "evidence": json.dumps(["e1"]),
            "objective": "A comprehensive objective for this control.",
        }
        assert quality_score(sparse) < quality_score(rich)
# ---------------------------------------------------------------------------
# Sub-grouping TESTS
# ---------------------------------------------------------------------------
class TestSubGrouping:
    """BatchDedupRunner._sub_group_by_merge_hint(): grouping by ``merge_group_hint``."""

    def _make_runner(self):
        """Return a runner backed by a throwaway mock DB session."""
        return BatchDedupRunner(db=MagicMock())

    def test_groups_by_merge_hint(self):
        controls = [
            {"uuid": uid, "merge_group_hint": hint}
            for uid, hint in (
                ("a", "implement:mfa:none"),
                ("b", "implement:mfa:none"),
                ("c", "test:firewall:periodic"),
            )
        ]

        groups = self._make_runner()._sub_group_by_merge_hint(controls)

        assert len(groups) == 2
        assert len(groups["implement:mfa:none"]) == 2
        assert len(groups["test:firewall:periodic"]) == 1

    def test_empty_hint_gets_own_group(self):
        controls = [{"uuid": uid, "merge_group_hint": ""} for uid in ("x", "y")]

        groups = self._make_runner()._sub_group_by_merge_hint(controls)

        # Controls without a hint are not merged together: one group each.
        assert len(groups) == 2

    def test_single_control_single_group(self):
        only = {"uuid": "a", "merge_group_hint": "implement:mfa:none"}

        groups = self._make_runner()._sub_group_by_merge_hint([only])

        assert len(groups) == 1
# ---------------------------------------------------------------------------
# Master Selection TESTS
# ---------------------------------------------------------------------------
class TestMasterSelection:
    """Within a hint group exactly one control becomes master; the rest are linked."""

    @pytest.mark.asyncio
    async def test_highest_score_is_master(self):
        """Three same-title controls collapse into one master and two linked duplicates."""
        session = MagicMock()
        # The parent-link transfer query finds nothing to move.
        session.execute.return_value.fetchall.return_value = []
        runner = BatchDedupRunner(db=session)

        # Identical hint and title on purpose → all go through the title-identical path.
        shared = {"hint": "implement:mfa:none", "title": "MFA implementiert"}
        sparse = _make_control("s1", reqs=1, **shared)
        medium = _make_control("m1", reqs=2, tests=1, **shared)
        rich = _make_control("r1", reqs=5, tests=3, evidence=2, **shared)

        embedding = patch("compliance.services.batch_dedup_runner.get_embedding",
                          new_callable=AsyncMock, return_value=[0.1] * 1024)
        upsert = patch("compliance.services.batch_dedup_runner.qdrant_upsert",
                       new_callable=AsyncMock, return_value=True)
        with embedding, upsert:
            await runner._process_hint_group(
                "implement:mfa:none", [sparse, medium, rich], dry_run=True
            )

        # One master (expected to be the rich control) and two linked duplicates.
        stats = runner.stats
        assert stats["masters"] == 1
        assert stats["linked"] == 2
        assert stats["skipped_title_identical"] == 2
# ---------------------------------------------------------------------------
# Dry Run TESTS
# ---------------------------------------------------------------------------
class TestDryRun:
    """In dry-run mode the stats are computed but nothing is committed."""

    @pytest.mark.asyncio
    async def test_dry_run_no_db_writes(self):
        session = MagicMock()
        runner = BatchDedupRunner(db=session)
        pair = [
            _make_control(prefix, reqs=reqs, hint="implement:mfa:none", title="MFA impl")
            for prefix, reqs in (("a", 3), ("b", 1))
        ]

        embedding = patch("compliance.services.batch_dedup_runner.get_embedding",
                          new_callable=AsyncMock, return_value=[0.1] * 1024)
        upsert = patch("compliance.services.batch_dedup_runner.qdrant_upsert",
                       new_callable=AsyncMock, return_value=True)
        with embedding, upsert:
            await runner._process_hint_group("implement:mfa:none", pair, dry_run=True)

        assert runner.stats["masters"] == 1
        assert runner.stats["linked"] == 1
        # A dry run must never commit dedup changes.
        session.commit.assert_not_called()
# ---------------------------------------------------------------------------
# Parent Link Transfer TESTS
# ---------------------------------------------------------------------------
class TestParentLinkTransfer:
    """_transfer_parent_links(): moving a duplicate's parent links to its master."""

    def test_transfer_parent_links(self):
        session = MagicMock()
        # The duplicate currently owns two parent links.
        duplicate_links = [
            ("parent-1", "decomposition", 1.0, "DSGVO", "Art. 32", "obl-1"),
            ("parent-2", "decomposition", 0.9, "NIS2", "Art. 21", "obl-2"),
        ]
        session.execute.return_value.fetchall.return_value = duplicate_links

        transferred = BatchDedupRunner(db=session)._transfer_parent_links(
            "master-uuid", "dup-uuid"
        )

        assert transferred == 2
        # One SELECT plus one INSERT per transferred link.
        assert session.execute.call_count == 3  # 1 SELECT + 2 INSERTs

    def test_transfer_skips_self_reference(self):
        session = MagicMock()
        # The only link points at the master itself, so it must not be copied.
        session.execute.return_value.fetchall.return_value = [
            ("master-uuid", "decomposition", 1.0, "DSGVO", "Art. 32", "obl-1"),
        ]

        transferred = BatchDedupRunner(db=session)._transfer_parent_links(
            "master-uuid", "dup-uuid"
        )

        assert transferred == 0
# ---------------------------------------------------------------------------
# Title-identical Short-circuit TESTS
# ---------------------------------------------------------------------------
class TestTitleIdenticalShortCircuit:
    """Same-hint controls: identical titles link directly, different titles go via embeddings."""

    @pytest.mark.asyncio
    async def test_identical_titles_skip_embedding(self):
        """Two controls sharing hint and title are linked through the title-identical path."""
        session = MagicMock()
        session.execute.return_value.fetchall.return_value = []
        runner = BatchDedupRunner(db=session)

        hint = "implement:mfa:none"
        controls = [
            _make_control("m", reqs=3, hint=hint, title="MFA implementieren"),
            _make_control("c", reqs=1, hint=hint, title="MFA implementieren"),
        ]

        embedding = patch("compliance.services.batch_dedup_runner.get_embedding",
                          new_callable=AsyncMock)
        upsert = patch("compliance.services.batch_dedup_runner.qdrant_upsert",
                       new_callable=AsyncMock, return_value=True)
        with embedding, upsert:
            await runner._process_hint_group("implement:mfa:none", controls, dry_run=False)

        # The duplicate is counted both as linked and as a title-identical skip.
        assert runner.stats["linked"] == 1
        assert runner.stats["skipped_title_identical"] == 1

    @pytest.mark.asyncio
    async def test_different_titles_use_embedding(self):
        """Two controls sharing a hint but not a title go through the embedding check."""
        session = MagicMock()
        session.execute.return_value.fetchall.return_value = []
        runner = BatchDedupRunner(db=session)

        hint = "implement:mfa:none"
        controls = [
            _make_control("m", reqs=3, hint=hint, title="MFA implementieren fuer Admins"),
            _make_control("c", reqs=1, hint=hint, title="MFA einrichten fuer alle Benutzer"),
        ]

        embedding = patch("compliance.services.batch_dedup_runner.get_embedding",
                          new_callable=AsyncMock, return_value=[0.1] * 1024)
        upsert = patch("compliance.services.batch_dedup_runner.qdrant_upsert",
                       new_callable=AsyncMock, return_value=True)
        cross_search = patch("compliance.services.batch_dedup_runner.qdrant_search_cross_regulation",
                             new_callable=AsyncMock, return_value=[])
        with embedding as mock_embed, upsert, cross_search:
            await runner._process_hint_group("implement:mfa:none", controls, dry_run=False)

        # Differing titles: an embedding is requested for master and candidate alike.
        assert mock_embed.call_count >= 2
        # The search returned nothing, yet a shared hint (same action+object) still links.
        assert runner.stats["linked"] == 1
# ---------------------------------------------------------------------------
# Cross-Group Pass TESTS
# ---------------------------------------------------------------------------
class TestCrossGroupPass:
    """_run_cross_group_pass(): linking across different merge-hint groups."""

    @pytest.mark.asyncio
    async def test_cross_group_creates_link(self):
        master_rows = [
            ("uuid-1", "CTRL-001", "MFA implementieren",
             "implement:multi_factor_auth:none"),
        ]
        # Only the first execute() yields the master rows; every later call
        # (e.g. the link transfer) sees an empty result set.
        pending_results = iter([master_rows])

        def fake_execute(stmt, params=None):
            outcome = MagicMock()
            outcome.fetchall.return_value = next(pending_results, [])
            return outcome

        session = MagicMock()
        session.execute = fake_execute
        runner = BatchDedupRunner(db=session)

        # A high-scoring hit whose hint differs from the master's.
        hit = {
            "score": 0.95,
            "payload": {
                "control_uuid": "uuid-2",
                "control_id": "CTRL-002",
                "merge_group_hint": "implement:mfa:continuous",
            },
        }

        embedding = patch("compliance.services.batch_dedup_runner.get_embedding",
                          new_callable=AsyncMock, return_value=[0.1] * 1024)
        cross_search = patch("compliance.services.batch_dedup_runner.qdrant_search_cross_regulation",
                             new_callable=AsyncMock, return_value=[hit])
        with embedding, cross_search:
            await runner._run_cross_group_pass()

        assert runner.stats["cross_group_linked"] == 1
# ---------------------------------------------------------------------------
# Progress Stats TESTS
# ---------------------------------------------------------------------------
class TestProgressStats:
    """get_status() reflects the runner's progress fields and stats counters."""

    def test_get_status(self):
        runner = BatchDedupRunner(db=MagicMock())
        runner.stats["masters"] = 42
        runner.stats["linked"] = 100
        runner._progress_phase = "phase1"
        runner._progress_count = 500
        runner._progress_total = 85000

        status = runner.get_status()

        expected = {
            "phase": "phase1",
            "progress": 500,
            "total": 85000,
            "masters": 42,
            "linked": 100,
        }
        for key, value in expected.items():
            assert status[key] == value
# ---------------------------------------------------------------------------
# Route endpoint TESTS
# ---------------------------------------------------------------------------
class TestBatchDedupRoutes:
    """HTTP-level checks for the batch-dedup endpoints."""

    def test_status_endpoint_not_running(self):
        """The status endpoint answers 200 with ``running`` set to False."""
        from fastapi import FastAPI
        from fastapi.testclient import TestClient
        from compliance.api.crosswalk_routes import router

        application = FastAPI()
        application.include_router(router, prefix="/api/compliance")
        http = TestClient(application)

        with patch("compliance.api.crosswalk_routes.SessionLocal") as session_factory:
            session = session_factory.return_value
            session.execute.return_value.fetchone.return_value = (85000, 0, 85000)
            response = http.get("/api/compliance/v1/canonical/migrate/batch-dedup/status")

        assert response.status_code == 200
        payload = response.json()
        assert payload["running"] is False
# ---------------------------------------------------------------------------
# HELPERS
# ---------------------------------------------------------------------------
def _make_control(
prefix: str,
reqs: int = 0,
tests: int = 0,
evidence: int = 0,
hint: str = "",
title: str = None,
pattern_id: str = None,
) -> dict:
"""Build a mock control dict for testing."""
return {
"uuid": f"{prefix}-uuid",
"control_id": f"CTRL-{prefix}",
"title": title or f"Control {prefix}",
"objective": f"Objective for {prefix}",
"pattern_id": pattern_id,
"requirements": json.dumps([f"r{i}" for i in range(reqs)]),
"test_procedure": json.dumps([f"t{i}" for i in range(tests)]),
"evidence": json.dumps([f"e{i}" for i in range(evidence)]),
"release_state": "draft",
"merge_group_hint": hint,
"action_object_class": "",
}
@@ -1,17 +1,36 @@
"""Tests for Canonical Control Library routes (canonical_control_routes.py)."""
"""Tests for Canonical Control Library routes (canonical_control_routes.py).
Includes:
- Model validation tests (FrameworkResponse, ControlResponse, etc.)
- _control_row conversion tests
- Server-side pagination, sorting, search, source filter tests
- /controls-count and /controls-meta endpoint tests
"""
import pytest
from unittest.mock import MagicMock, patch
from datetime import datetime, timezone
from fastapi import FastAPI
from fastapi.testclient import TestClient
from compliance.api.canonical_control_routes import (
FrameworkResponse,
ControlResponse,
SimilarityCheckRequest,
SimilarityCheckResponse,
_control_row,
router,
)
# ---------------------------------------------------------------------------
# TestClient setup for endpoint tests
# ---------------------------------------------------------------------------
# One FastAPI app that mounts only the canonical-control router under the
# "/api/compliance" prefix; the endpoint tests in this module send their
# requests through this shared module-level client.
_app = FastAPI()
_app.include_router(router, prefix="/api/compliance")
_client = TestClient(_app)
class TestFrameworkResponse:
"""Tests for FrameworkResponse model."""
@@ -175,6 +194,12 @@ class TestControlRowConversion:
],
"release_state": "draft",
"tags": ["mfa"],
"generation_strategy": "ungrouped",
"parent_control_uuid": None,
"parent_control_id": None,
"parent_control_title": None,
"decomposition_method": None,
"pipeline_version": None,
"created_at": now,
"updated_at": now,
}
@@ -223,3 +248,300 @@ class TestControlRowConversion:
result = _control_row(row)
assert result["created_at"] is None
assert result["updated_at"] is None
def test_generation_strategy_default(self):
    """A row built without overrides converts with generation_strategy "ungrouped"."""
    converted = _control_row(self._make_row())
    assert converted["generation_strategy"] == "ungrouped"
def test_generation_strategy_document_grouped(self):
    """An explicit generation_strategy on the row is passed through by the conversion."""
    converted = _control_row(self._make_row(generation_strategy="document_grouped"))
    assert converted["generation_strategy"] == "document_grouped"
# =============================================================================
# ENDPOINT TESTS — Server-Side Pagination, Sort, Search, Source Filter
# =============================================================================
def _make_mock_row(**overrides):
    """Return a MagicMock row exposing the canonical_controls columns as attributes.

    Every column gets a default value; keyword arguments replace defaults or
    add further attributes. ``created_at`` and ``updated_at`` default to the
    same timezone-aware "now" timestamp.
    """
    timestamp = datetime.now(timezone.utc)
    columns = dict(
        id="uuid-ctrl-1",
        framework_id="uuid-fw-1",
        control_id="AUTH-001",
        title="Test Control",
        objective="Test obj",
        rationale="Test rat",
        scope={},
        requirements=["Req 1"],
        test_procedure=["Test 1"],
        evidence=[],
        severity="high",
        risk_score=3.0,
        implementation_effort="m",
        evidence_confidence=None,
        open_anchors=[],
        release_state="draft",
        tags=[],
        license_rule=1,
        source_original_text=None,
        source_citation=None,
        customer_visible=True,
        verification_method="automated",
        category="authentication",
        target_audience="developer",
        generation_metadata={},
        generation_strategy="ungrouped",
        created_at=timestamp,
        updated_at=timestamp,
    )
    row = MagicMock()
    for column, value in {**columns, **overrides}.items():
        setattr(row, column, value)
    return row
def _session_returning(rows=None, scalar=None):
    """Build a mock DB session whose ``execute()`` yields one canned result.

    ``rows`` (when given) is returned by ``fetchall()`` and ``scalar`` (when
    given) by ``scalar()``. The session also works as a context manager that
    yields itself and does not suppress exceptions.
    """
    outcome = MagicMock()
    if rows is not None:
        outcome.fetchall.return_value = rows
    if scalar is not None:
        outcome.scalar.return_value = scalar

    session = MagicMock()
    session.execute.return_value = outcome
    session.__enter__.return_value = session
    session.__exit__.return_value = False
    return session
class TestListControlsPagination:
    """GET /controls: limit/offset handling in the generated SQL."""

    @patch("compliance.api.canonical_control_routes.SessionLocal")
    def test_limit_param_in_sql(self, mock_cls):
        """Passing limit and offset adds LIMIT and OFFSET to the statement."""
        session = _session_returning(rows=[_make_mock_row()])
        mock_cls.return_value = session

        response = _client.get("/api/compliance/v1/canonical/controls?limit=10&offset=20")

        assert response.status_code == 200
        statement = session.execute.call_args.args[0]
        sql = str(statement.text)
        assert "LIMIT" in sql
        assert "OFFSET" in sql

    @patch("compliance.api.canonical_control_routes.SessionLocal")
    def test_no_limit_by_default(self, mock_cls):
        """Without pagination parameters the statement carries no LIMIT."""
        session = _session_returning(rows=[])
        mock_cls.return_value = session

        response = _client.get("/api/compliance/v1/canonical/controls")

        assert response.status_code == 200
        statement = session.execute.call_args.args[0]
        assert "LIMIT" not in str(statement.text)
class TestListControlsSorting:
    """GET /controls: sort/order handling in the generated SQL."""

    @patch("compliance.api.canonical_control_routes.SessionLocal")
    def test_sort_created_at_desc(self, mock_cls):
        session = _session_returning(rows=[])
        mock_cls.return_value = session

        response = _client.get("/api/compliance/v1/canonical/controls?sort=created_at&order=desc")

        assert response.status_code == 200
        sql = str(session.execute.call_args.args[0].text)
        assert "created_at DESC" in sql

    @patch("compliance.api.canonical_control_routes.SessionLocal")
    def test_default_sort_control_id_asc(self, mock_cls):
        session = _session_returning(rows=[])
        mock_cls.return_value = session

        response = _client.get("/api/compliance/v1/canonical/controls")

        assert response.status_code == 200
        sql = str(session.execute.call_args.args[0].text)
        assert "control_id ASC" in sql

    @patch("compliance.api.canonical_control_routes.SessionLocal")
    def test_sql_injection_in_sort_blocked(self, mock_cls):
        """A hostile sort value never reaches the SQL; control_id is used instead."""
        session = _session_returning(rows=[])
        mock_cls.return_value = session

        response = _client.get("/api/compliance/v1/canonical/controls?sort=1;DROP+TABLE")

        assert response.status_code == 200
        sql = str(session.execute.call_args.args[0].text)
        assert "DROP" not in sql
        assert "control_id" in sql

    @patch("compliance.api.canonical_control_routes.SessionLocal")
    def test_sort_by_source(self, mock_cls):
        session = _session_returning(rows=[])
        mock_cls.return_value = session

        response = _client.get("/api/compliance/v1/canonical/controls?sort=source&order=asc")

        assert response.status_code == 200
        sql = str(session.execute.call_args.args[0].text)
        assert "source_citation" in sql
        # control_id acts as the tie-breaker inside each source group
        assert "control_id ASC" in sql
class TestListControlsSearch:
    """GET /controls: free-text search."""

    @patch("compliance.api.canonical_control_routes.SessionLocal")
    def test_search_uses_ilike(self, mock_cls):
        """The search term is matched with ILIKE and bound as a %-wrapped parameter."""
        session = _session_returning(rows=[])
        mock_cls.return_value = session

        response = _client.get("/api/compliance/v1/canonical/controls?search=encryption")

        assert response.status_code == 200
        statement, bound = session.execute.call_args.args[:2]
        assert "ILIKE" in str(statement.text)
        assert bound["q"] == "%encryption%"
class TestListControlsSourceFilter:
    """GET /controls: filtering by source."""

    @patch("compliance.api.canonical_control_routes.SessionLocal")
    def test_specific_source(self, mock_cls):
        """A concrete source is filtered on source_citation via a bound parameter."""
        session = _session_returning(rows=[])
        mock_cls.return_value = session

        response = _client.get("/api/compliance/v1/canonical/controls?source=DSGVO")

        assert response.status_code == 200
        statement, bound = session.execute.call_args.args[:2]
        assert "source_citation" in str(statement.text)
        assert bound["src"] == "DSGVO"

    @patch("compliance.api.canonical_control_routes.SessionLocal")
    def test_no_source_filter(self, mock_cls):
        """The ``__none__`` sentinel results in an IS NULL condition."""
        session = _session_returning(rows=[])
        mock_cls.return_value = session

        response = _client.get("/api/compliance/v1/canonical/controls?source=__none__")

        assert response.status_code == 200
        statement = session.execute.call_args.args[0]
        assert "IS NULL" in str(statement.text)
class TestControlsCount:
    """GET /controls-count."""

    @patch("compliance.api.canonical_control_routes.SessionLocal")
    def test_returns_total(self, mock_cls):
        """The scalar from the query is returned as ``{"total": ...}``."""
        mock_cls.return_value = _session_returning(scalar=42)

        response = _client.get("/api/compliance/v1/canonical/controls-count")

        assert response.status_code == 200
        assert response.json() == {"total": 42}

    @patch("compliance.api.canonical_control_routes.SessionLocal")
    def test_with_filters(self, mock_cls):
        """severity and search filters both show up in the count statement."""
        session = _session_returning(scalar=5)
        mock_cls.return_value = session

        response = _client.get(
            "/api/compliance/v1/canonical/controls-count?severity=critical&search=mfa"
        )

        assert response.status_code == 200
        assert response.json() == {"total": 5}
        sql = str(session.execute.call_args.args[0].text)
        assert "severity" in sql
        assert "ILIKE" in sql
class TestControlsMeta:
    """GET /controls-meta."""

    @patch("compliance.api.canonical_control_routes.SessionLocal")
    def test_returns_structure(self, mock_cls):
        """The response carries the total plus every facet key."""
        # The endpoint issues many execute() calls, so a single result object
        # answers both scalar() and fetchall() for all of them.
        catch_all = MagicMock()
        catch_all.scalar.return_value = 100
        catch_all.fetchall.return_value = []

        session = MagicMock()
        session.__enter__.return_value = session
        session.__exit__.return_value = False
        session.execute.return_value = catch_all
        mock_cls.return_value = session

        response = _client.get("/api/compliance/v1/canonical/controls-meta")

        assert response.status_code == 200
        payload = response.json()
        assert payload["total"] == 100
        assert isinstance(payload["domains"], list)
        assert isinstance(payload["sources"], list)
        assert "type_counts" in payload
        assert "severity_counts" in payload
        assert "verification_method_counts" in payload
        assert "category_counts" in payload
        assert "evidence_type_counts" in payload
        assert "release_state_counts" in payload
class TestObligationDedup:
    """Endpoints for obligation deduplication."""

    @patch("compliance.api.canonical_control_routes.SessionLocal")
    def test_dedup_dry_run(self, mock_cls):
        """A dry run reports what would be kept/marked but never commits."""
        import uuid

        session = MagicMock()
        session.__enter__.return_value = session
        session.__exit__.return_value = False
        mock_cls.return_value = session

        def obligation(candidate_id, text, day):
            """One obligation row created on 2026-01-<day> (UTC)."""
            return MagicMock(
                id=uuid.uuid4(),
                candidate_id=candidate_id,
                obligation_text=text,
                release_state="composed",
                created_at=datetime(2026, 1, day, tzinfo=timezone.utc),
            )

        def fetchall_result(rows):
            result = MagicMock()
            result.fetchall.return_value = rows
            return result

        # Two candidate ids with duplicates: three entries and two entries.
        duplicate_groups = [
            MagicMock(candidate_id="OC-AUTH-001-01", cnt=3),
            MagicMock(candidate_id="OC-AUTH-001-02", cnt=2),
        ]
        first_group = [
            obligation("OC-AUTH-001-01", "Text A", 1),
            obligation("OC-AUTH-001-01", "Text B", 2),
            obligation("OC-AUTH-001-01", "Text C", 3),
        ]
        second_group = [
            obligation("OC-AUTH-001-02", "Text D", 1),
            obligation("OC-AUTH-001-02", "Text E", 2),
        ]
        total_result = MagicMock()
        total_result.scalar.return_value = 2

        # execute() is consumed in this order:
        # duplicate groups, total count, entries of group 1, entries of group 2.
        session.execute.side_effect = [
            fetchall_result(duplicate_groups),
            total_result,
            fetchall_result(first_group),
            fetchall_result(second_group),
        ]

        response = _client.post("/api/compliance/v1/canonical/obligations/dedup?dry_run=true")

        assert response.status_code == 200
        payload = response.json()
        assert payload["dry_run"] is True
        stats = payload["stats"]
        assert stats["total_duplicate_groups"] == 2
        assert stats["kept"] == 2
        assert stats["marked_duplicate"] == 3  # 2 from grp1 + 1 from grp2
        # Nothing may be committed during a dry run.
        session.commit.assert_not_called()

    @patch("compliance.api.canonical_control_routes.SessionLocal")
    def test_dedup_stats(self, mock_cls):
        """The stats endpoint reports totals, per-state counts and pending work."""
        session = MagicMock()
        session.__enter__.return_value = session
        session.__exit__.return_value = False
        mock_cls.return_value = session

        def scalar_result(value):
            result = MagicMock()
            result.scalar.return_value = value
            return result

        by_state = MagicMock()
        by_state.fetchall.return_value = [
            MagicMock(release_state="composed", cnt=41217),
            MagicMock(release_state="duplicate", cnt=34829),
        ]

        # execute() is consumed in this order:
        # total, per-state counts, duplicate groups, removable duplicates.
        session.execute.side_effect = [
            scalar_result(76046),
            by_state,
            scalar_result(0),
            scalar_result(0),
        ]

        response = _client.get("/api/compliance/v1/canonical/obligations/dedup-stats")

        assert response.status_code == 200
        payload = response.json()
        assert payload["total_obligations"] == 76046
        assert payload["by_state"]["composed"] == 41217
        assert payload["by_state"]["duplicate"] == 34829
        assert payload["pending_duplicate_groups"] == 0
        assert payload["pending_removable_duplicates"] == 0

Some files were not shown because too many files have changed in this diff Show More