merge: sync with origin/main, take upstream on conflicts
# Conflicts: # admin-compliance/lib/sdk/types.ts # admin-compliance/lib/sdk/vendor-compliance/types.ts
This commit is contained in:
@@ -6,6 +6,8 @@ from .routes import router
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
_failed_routers: dict[str, str] = {}
|
||||
|
||||
|
||||
def _safe_import_router(module_name: str, attr: str = "router"):
|
||||
"""Import a router module safely — log error but don't crash the whole app."""
|
||||
@@ -14,6 +16,7 @@ def _safe_import_router(module_name: str, attr: str = "router"):
|
||||
return getattr(mod, attr)
|
||||
except Exception as e:
|
||||
logger.error("Failed to import %s: %s", module_name, e)
|
||||
_failed_routers[module_name] = str(e)
|
||||
return None
|
||||
|
||||
|
||||
@@ -53,6 +56,13 @@ _ROUTER_MODULES = [
|
||||
"wiki_routes",
|
||||
"canonical_control_routes",
|
||||
"control_generator_routes",
|
||||
"crosswalk_routes",
|
||||
"process_task_routes",
|
||||
"evidence_check_routes",
|
||||
"vvt_library_routes",
|
||||
"tom_mapping_routes",
|
||||
"llm_audit_routes",
|
||||
"assertion_routes",
|
||||
]
|
||||
|
||||
_loaded_count = 0
|
||||
|
||||
227
backend-compliance/compliance/api/assertion_routes.py
Normal file
227
backend-compliance/compliance/api/assertion_routes.py
Normal file
@@ -0,0 +1,227 @@
|
||||
"""
|
||||
API routes for Assertion Engine (Anti-Fake-Evidence Phase 2).
|
||||
|
||||
Endpoints:
|
||||
- /assertions: CRUD for assertions
|
||||
- /assertions/extract: Automatic extraction from entity text
|
||||
- /assertions/summary: Stats (total assertions, facts, unverified)
|
||||
"""
|
||||
|
||||
import logging
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException, Query
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from classroom_engine.database import get_db
|
||||
|
||||
from ..db.models import AssertionDB
|
||||
from ..services.assertion_engine import extract_assertions
|
||||
from .schemas import (
|
||||
AssertionCreate,
|
||||
AssertionUpdate,
|
||||
AssertionResponse,
|
||||
AssertionListResponse,
|
||||
AssertionSummaryResponse,
|
||||
AssertionExtractRequest,
|
||||
)
|
||||
from .audit_trail_utils import log_audit_trail, generate_id
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
router = APIRouter(tags=["compliance-assertions"])
|
||||
|
||||
|
||||
def _build_assertion_response(a: AssertionDB) -> AssertionResponse:
    """Map an AssertionDB ORM row to the API response schema.

    Normalizes nullable columns so clients never see null for them:
    evidence_ids falls back to an empty list, confidence to 0.0.
    """
    return AssertionResponse(
        id=a.id,
        tenant_id=a.tenant_id,
        entity_type=a.entity_type,
        entity_id=a.entity_id,
        sentence_text=a.sentence_text,
        sentence_index=a.sentence_index,
        assertion_type=a.assertion_type,
        evidence_ids=a.evidence_ids or [],
        confidence=a.confidence or 0.0,
        normative_tier=a.normative_tier,
        verified_by=a.verified_by,
        verified_at=a.verified_at,
        created_at=a.created_at,
        updated_at=a.updated_at,
    )
|
||||
|
||||
|
||||
@router.post("/assertions", response_model=AssertionResponse)
async def create_assertion(
    data: AssertionCreate,
    tenant_id: Optional[str] = Query(None),
    db: Session = Depends(get_db),
):
    """Create a single assertion manually.

    assertion_type defaults to "assertion" when not supplied;
    sentence_index and confidence are not set here and keep their
    model/column defaults (the extract endpoint sets them explicitly).
    """
    # NOTE(review): no log_audit_trail call here although the helper is
    # imported at module level — confirm whether manual creation should
    # be audited.
    a = AssertionDB(
        id=generate_id(),
        tenant_id=tenant_id,
        entity_type=data.entity_type,
        entity_id=data.entity_id,
        sentence_text=data.sentence_text,
        assertion_type=data.assertion_type or "assertion",
        evidence_ids=data.evidence_ids or [],
        normative_tier=data.normative_tier,
    )
    db.add(a)
    db.commit()
    # Re-read so DB-generated defaults/timestamps are populated.
    db.refresh(a)
    return _build_assertion_response(a)
|
||||
|
||||
|
||||
@router.get("/assertions", response_model=AssertionListResponse)
async def list_assertions(
    entity_type: Optional[str] = Query(None),
    entity_id: Optional[str] = Query(None),
    assertion_type: Optional[str] = Query(None),
    tenant_id: Optional[str] = Query(None),
    limit: int = Query(100, ge=1, le=500),
    db: Session = Depends(get_db),
):
    """List assertions with optional filters.

    All supplied filters are AND-combined.  `total` is the filtered
    count BEFORE `limit` is applied; results are returned in ascending
    sentence_index order (document order).
    """
    query = db.query(AssertionDB)
    if entity_type:
        query = query.filter(AssertionDB.entity_type == entity_type)
    if entity_id:
        query = query.filter(AssertionDB.entity_id == entity_id)
    if assertion_type:
        query = query.filter(AssertionDB.assertion_type == assertion_type)
    if tenant_id:
        query = query.filter(AssertionDB.tenant_id == tenant_id)

    total = query.count()
    records = query.order_by(AssertionDB.sentence_index.asc()).limit(limit).all()

    return AssertionListResponse(
        assertions=[_build_assertion_response(a) for a in records],
        total=total,
    )
|
||||
|
||||
|
||||
@router.get("/assertions/summary", response_model=AssertionSummaryResponse)
async def assertion_summary(
    tenant_id: Optional[str] = Query(None),
    entity_type: Optional[str] = Query(None),
    entity_id: Optional[str] = Query(None),
    db: Session = Depends(get_db),
):
    """Summary stats: total assertions, facts, rationale, unverified.

    "Unverified" counts rows still typed "assertion" (neither fact nor
    rationale) that have no verifier recorded.
    """
    query = db.query(AssertionDB)
    if tenant_id:
        query = query.filter(AssertionDB.tenant_id == tenant_id)
    if entity_type:
        query = query.filter(AssertionDB.entity_type == entity_type)
    if entity_id:
        query = query.filter(AssertionDB.entity_id == entity_id)

    # NOTE(review): all matching rows are materialized and counted in
    # Python; for large tables SQL-side aggregation (GROUP BY / COUNT)
    # would be cheaper.
    all_records = query.all()

    total = len(all_records)
    facts = sum(1 for a in all_records if a.assertion_type == "fact")
    rationale = sum(1 for a in all_records if a.assertion_type == "rationale")
    unverified = sum(1 for a in all_records if a.assertion_type == "assertion" and not a.verified_by)

    return AssertionSummaryResponse(
        total_assertions=total,
        total_facts=facts,
        total_rationale=rationale,
        unverified_count=unverified,
    )
|
||||
|
||||
|
||||
@router.get("/assertions/{assertion_id}", response_model=AssertionResponse)
async def get_assertion(
    assertion_id: str,
    db: Session = Depends(get_db),
):
    """Fetch one assertion by its primary key; 404 if it does not exist."""
    record = db.query(AssertionDB).filter(AssertionDB.id == assertion_id).first()
    if record is None:
        raise HTTPException(status_code=404, detail=f"Assertion {assertion_id} not found")
    return _build_assertion_response(record)
|
||||
|
||||
|
||||
@router.put("/assertions/{assertion_id}", response_model=AssertionResponse)
async def update_assertion(
    assertion_id: str,
    data: AssertionUpdate,
    db: Session = Depends(get_db),
):
    """Update an assertion (e.g. link evidence, change type).

    Partial update: only fields explicitly set on the request payload
    are applied (model_dump(exclude_unset=True)).  404 if the assertion
    does not exist.
    """
    a = db.query(AssertionDB).filter(AssertionDB.id == assertion_id).first()
    if not a:
        raise HTTPException(status_code=404, detail=f"Assertion {assertion_id} not found")

    update_fields = data.model_dump(exclude_unset=True)
    for key, value in update_fields.items():
        setattr(a, key, value)
    # NOTE(review): datetime.utcnow() is naive and deprecated since 3.12 —
    # consider datetime.now(timezone.utc) if the column is tz-aware.
    a.updated_at = datetime.utcnow()
    db.commit()
    db.refresh(a)
    return _build_assertion_response(a)
|
||||
|
||||
|
||||
@router.post("/assertions/{assertion_id}/verify", response_model=AssertionResponse)
async def verify_assertion(
    assertion_id: str,
    verified_by: str = Query(...),
    db: Session = Depends(get_db),
):
    """Mark an assertion as verified fact.

    Promotes assertion_type to "fact" and stamps the verifier plus
    verification/update timestamps.  404 if the assertion does not exist.
    """
    a = db.query(AssertionDB).filter(AssertionDB.id == assertion_id).first()
    if not a:
        raise HTTPException(status_code=404, detail=f"Assertion {assertion_id} not found")

    a.assertion_type = "fact"
    a.verified_by = verified_by
    a.verified_at = datetime.utcnow()
    a.updated_at = datetime.utcnow()
    db.commit()
    db.refresh(a)
    return _build_assertion_response(a)
|
||||
|
||||
|
||||
@router.post("/assertions/extract", response_model=AssertionListResponse)
async def extract_assertions_endpoint(
    data: AssertionExtractRequest,
    tenant_id: Optional[str] = Query(None),
    db: Session = Depends(get_db),
):
    """Extract assertions from free text and persist them.

    Delegates sentence splitting/classification to
    services.assertion_engine.extract_assertions, then stores one
    AssertionDB row per extracted item in a single commit and returns
    the newly created rows.
    """
    extracted = extract_assertions(
        text=data.text,
        entity_type=data.entity_type,
        entity_id=data.entity_id,
        tenant_id=tenant_id,
    )

    created = []
    for item in extracted:
        a = AssertionDB(
            id=generate_id(),
            tenant_id=item["tenant_id"],
            entity_type=item["entity_type"],
            entity_id=item["entity_id"],
            sentence_text=item["sentence_text"],
            sentence_index=item["sentence_index"],
            assertion_type=item["assertion_type"],
            evidence_ids=item["evidence_ids"],
            normative_tier=item.get("normative_tier"),
            confidence=item.get("confidence", 0.0),
        )
        db.add(a)
        created.append(a)

    db.commit()
    # Refresh after the single commit so DB defaults/timestamps are loaded.
    for a in created:
        db.refresh(a)

    return AssertionListResponse(
        assertions=[_build_assertion_response(a) for a in created],
        total=len(created),
    )
|
||||
53
backend-compliance/compliance/api/audit_trail_utils.py
Normal file
53
backend-compliance/compliance/api/audit_trail_utils.py
Normal file
@@ -0,0 +1,53 @@
|
||||
"""Shared audit trail utilities.
|
||||
|
||||
Extracted from isms_routes.py for reuse across evidence, control,
|
||||
and assertion routes.
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import uuid
|
||||
from datetime import datetime
|
||||
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from ..db.models import AuditTrailDB
|
||||
|
||||
|
||||
def generate_id() -> str:
    """Return a fresh random UUID (version 4) in canonical string form."""
    new_id = uuid.uuid4()
    return str(new_id)
|
||||
|
||||
|
||||
def create_signature(data: str) -> str:
    """Return the hex-encoded SHA-256 digest of *data* (UTF-8 encoded)."""
    digest = hashlib.sha256()
    digest.update(data.encode())
    return digest.hexdigest()
|
||||
|
||||
|
||||
def log_audit_trail(
    db: Session,
    entity_type: str,
    entity_id: str,
    entity_name: str,
    action: str,
    performed_by: str,
    field_changed: str = None,
    old_value: str = None,
    new_value: str = None,
    change_summary: str = None,
):
    """Log an entry to the audit trail.

    Builds an AuditTrailDB row and adds it to *db*; the row is only
    staged — the caller is responsible for committing the session.

    Note: the checksum covers entity_type, entity_id, action and
    performed_by only; old/new values are NOT part of the signature.
    """
    trail = AuditTrailDB(
        id=generate_id(),
        entity_type=entity_type,
        entity_id=entity_id,
        entity_name=entity_name,
        action=action,
        field_changed=field_changed,
        old_value=old_value,
        new_value=new_value,
        change_summary=change_summary,
        performed_by=performed_by,
        performed_at=datetime.utcnow(),
        checksum=create_signature(f"{entity_type}|{entity_id}|{action}|{performed_by}"),
    )
    db.add(trail)
|
||||
File diff suppressed because it is too large
Load Diff
@@ -12,6 +12,7 @@ Endpoints:
|
||||
POST /v1/canonical/blocked-sources/cleanup — Start cleanup workflow
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
from typing import Optional, List
|
||||
@@ -25,7 +26,16 @@ from compliance.services.control_generator import (
|
||||
ControlGeneratorPipeline,
|
||||
GeneratorConfig,
|
||||
ALL_COLLECTIONS,
|
||||
VALID_CATEGORIES,
|
||||
VALID_DOMAINS,
|
||||
_classify_regulation,
|
||||
_detect_category,
|
||||
_detect_domain,
|
||||
_llm_local,
|
||||
_parse_llm_json,
|
||||
CATEGORY_LIST_STR,
|
||||
)
|
||||
from compliance.services.citation_backfill import CitationBackfill, BackfillResult
|
||||
from compliance.services.rag_client import get_rag_client
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -40,9 +50,12 @@ class GenerateRequest(BaseModel):
|
||||
domain: Optional[str] = None
|
||||
collections: Optional[List[str]] = None
|
||||
max_controls: int = 50
|
||||
max_chunks: int = 1000 # Default: process max 1000 chunks per job (respects document boundaries)
|
||||
batch_size: int = 5
|
||||
skip_web_search: bool = False
|
||||
dry_run: bool = False
|
||||
regulation_filter: Optional[List[str]] = None # Only process these regulation_code prefixes
|
||||
skip_prefilter: bool = False # Skip local LLM pre-filter, send all chunks to API
|
||||
|
||||
|
||||
class GenerateResponse(BaseModel):
|
||||
@@ -55,6 +68,7 @@ class GenerateResponse(BaseModel):
|
||||
controls_needs_review: int = 0
|
||||
controls_too_close: int = 0
|
||||
controls_duplicates_found: int = 0
|
||||
controls_qa_fixed: int = 0
|
||||
errors: list = []
|
||||
controls: list = []
|
||||
|
||||
@@ -89,42 +103,111 @@ class BlockedSourceResponse(BaseModel):
|
||||
# ENDPOINTS
|
||||
# =============================================================================
|
||||
|
||||
async def _run_pipeline_background(config: GeneratorConfig, job_id: str):
    """Run the pipeline in the background. Uses its own DB session.

    Re-attaches to the pre-created job row via config.existing_job_id.
    On failure the job row is marked 'failed' on a best-effort basis —
    errors during the status update itself are deliberately swallowed
    so the task never raises into the event loop.
    """
    db = SessionLocal()
    try:
        config.existing_job_id = job_id
        pipeline = ControlGeneratorPipeline(db=db, rag_client=get_rag_client())
        result = await pipeline.run(config)
        logger.info(
            "Background generation job %s completed: %d controls from %d chunks",
            job_id, result.controls_generated, result.total_chunks_scanned,
        )
    except Exception as e:
        logger.error("Background generation job %s failed: %s", job_id, e)
        # Update job as failed (best effort)
        try:
            db.execute(
                text("""
                    UPDATE canonical_generation_jobs
                    SET status = 'failed', errors = :errors, completed_at = NOW()
                    WHERE id = CAST(:job_id AS uuid)
                """),
                {"job_id": job_id, "errors": json.dumps([str(e)])},
            )
            db.commit()
        except Exception:
            pass
    finally:
        db.close()
|
||||
|
||||
|
||||
@router.post("/generate", response_model=GenerateResponse)
|
||||
async def start_generation(req: GenerateRequest):
|
||||
"""Start a control generation run."""
|
||||
"""Start a control generation run (runs in background).
|
||||
|
||||
Returns immediately with job_id. Use GET /generate/status/{job_id} to poll progress.
|
||||
"""
|
||||
config = GeneratorConfig(
|
||||
collections=req.collections,
|
||||
domain=req.domain,
|
||||
batch_size=req.batch_size,
|
||||
max_controls=req.max_controls,
|
||||
max_chunks=req.max_chunks,
|
||||
skip_web_search=req.skip_web_search,
|
||||
dry_run=req.dry_run,
|
||||
regulation_filter=req.regulation_filter,
|
||||
skip_prefilter=req.skip_prefilter,
|
||||
)
|
||||
|
||||
if req.dry_run:
|
||||
# Dry run: execute synchronously and return controls
|
||||
db = SessionLocal()
|
||||
try:
|
||||
pipeline = ControlGeneratorPipeline(db=db, rag_client=get_rag_client())
|
||||
result = await pipeline.run(config)
|
||||
return GenerateResponse(
|
||||
job_id=result.job_id,
|
||||
status=result.status,
|
||||
message=f"Dry run: {result.controls_generated} controls from {result.total_chunks_scanned} chunks",
|
||||
total_chunks_scanned=result.total_chunks_scanned,
|
||||
controls_generated=result.controls_generated,
|
||||
controls_verified=result.controls_verified,
|
||||
controls_needs_review=result.controls_needs_review,
|
||||
controls_too_close=result.controls_too_close,
|
||||
controls_duplicates_found=result.controls_duplicates_found,
|
||||
errors=result.errors,
|
||||
controls=result.controls,
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error("Dry run failed: %s", e)
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
finally:
|
||||
db.close()
|
||||
|
||||
# Create job record first so we can return the ID
|
||||
db = SessionLocal()
|
||||
try:
|
||||
pipeline = ControlGeneratorPipeline(db=db, rag_client=get_rag_client())
|
||||
result = await pipeline.run(config)
|
||||
|
||||
return GenerateResponse(
|
||||
job_id=result.job_id,
|
||||
status=result.status,
|
||||
message=f"Generated {result.controls_generated} controls from {result.total_chunks_scanned} chunks",
|
||||
total_chunks_scanned=result.total_chunks_scanned,
|
||||
controls_generated=result.controls_generated,
|
||||
controls_verified=result.controls_verified,
|
||||
controls_needs_review=result.controls_needs_review,
|
||||
controls_too_close=result.controls_too_close,
|
||||
controls_duplicates_found=result.controls_duplicates_found,
|
||||
errors=result.errors,
|
||||
controls=result.controls if req.dry_run else [],
|
||||
result = db.execute(
|
||||
text("""
|
||||
INSERT INTO canonical_generation_jobs (status, config)
|
||||
VALUES ('running', :config)
|
||||
RETURNING id
|
||||
"""),
|
||||
{"config": json.dumps(config.model_dump())},
|
||||
)
|
||||
db.commit()
|
||||
row = result.fetchone()
|
||||
job_id = str(row[0]) if row else None
|
||||
except Exception as e:
|
||||
logger.error("Generation failed: %s", e)
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
logger.error("Failed to create job: %s", e)
|
||||
raise HTTPException(status_code=500, detail=f"Failed to create job: {e}")
|
||||
finally:
|
||||
db.close()
|
||||
|
||||
if not job_id:
|
||||
raise HTTPException(status_code=500, detail="Failed to create job record")
|
||||
|
||||
# Launch pipeline in background
|
||||
asyncio.create_task(_run_pipeline_background(config, job_id))
|
||||
|
||||
return GenerateResponse(
|
||||
job_id=job_id,
|
||||
status="running",
|
||||
message="Generation started in background. Poll /generate/status/{job_id} for progress.",
|
||||
)
|
||||
|
||||
|
||||
@router.get("/generate/status/{job_id}")
|
||||
async def get_job_status(job_id: str):
|
||||
@@ -132,7 +215,7 @@ async def get_job_status(job_id: str):
|
||||
db = SessionLocal()
|
||||
try:
|
||||
result = db.execute(
|
||||
text("SELECT * FROM canonical_generation_jobs WHERE id = :id::uuid"),
|
||||
text("SELECT * FROM canonical_generation_jobs WHERE id = CAST(:id AS uuid)"),
|
||||
{"id": job_id},
|
||||
)
|
||||
row = result.fetchone()
|
||||
@@ -270,6 +353,188 @@ async def review_control(control_id: str, req: ReviewRequest):
|
||||
db.close()
|
||||
|
||||
|
||||
class BulkReviewRequest(BaseModel):
    """Payload for POST /generate/bulk-review."""
    release_state: str  # Filter: which controls to bulk-review
    action: str  # "approve" or "reject"
    new_state: Optional[str] = None  # Override target state
|
||||
|
||||
|
||||
@router.post("/generate/bulk-review")
async def bulk_review(req: BulkReviewRequest):
    """Bulk review all controls matching a release_state filter.

    approve → req.new_state (default "draft"); reject → always
    "deprecated" (new_state is ignored for reject).  Returns the number
    of affected controls.

    Example: reject all needs_review → sets them to deprecated.
    """
    if req.release_state not in ("needs_review", "too_close", "duplicate"):
        raise HTTPException(status_code=400, detail=f"Invalid filter state: {req.release_state}")

    if req.action == "approve":
        target = req.new_state or "draft"
    elif req.action == "reject":
        # Rejection always deprecates, regardless of req.new_state.
        target = "deprecated"
    else:
        raise HTTPException(status_code=400, detail=f"Unknown action: {req.action}")

    if target not in ("draft", "review", "approved", "deprecated", "needs_review"):
        raise HTTPException(status_code=400, detail=f"Invalid target state: {target}")

    db = SessionLocal()
    try:
        result = db.execute(
            text("""
                UPDATE canonical_controls
                SET release_state = :target, updated_at = NOW()
                WHERE release_state = :source
                RETURNING control_id
            """),
            {"source": req.release_state, "target": target},
        )
        affected = [row[0] for row in result]
        db.commit()

        return {
            "action": req.action,
            "source_state": req.release_state,
            "target_state": target,
            "affected_count": len(affected),
        }
    finally:
        db.close()
|
||||
|
||||
|
||||
class QAReclassifyRequest(BaseModel):
    """Payload for POST /generate/qa-reclassify."""
    limit: int = 100  # How many controls to reclassify per run
    dry_run: bool = True  # Preview only by default
    filter_category: Optional[str] = None  # Only reclassify controls of this category
    filter_domain_prefix: Optional[str] = None  # Only reclassify controls with this prefix
|
||||
|
||||
|
||||
@router.post("/generate/qa-reclassify")
async def qa_reclassify(req: QAReclassifyRequest):
    """Run QA reclassification on existing controls using local LLM.

    Finds controls where keyword-detection disagrees with current category/domain,
    then uses Ollama to determine the correct classification.

    Only the category is ever written back; domain changes are reported
    in the fix entries but never applied (control_id renames are avoided).
    Returns {"checked", "mismatches", "fixes", "errors"}.
    """
    db = SessionLocal()
    try:
        # Load controls to check (deprecated controls excluded)
        where_clauses = ["release_state NOT IN ('deprecated')"]
        params = {"limit": req.limit}
        if req.filter_category:
            where_clauses.append("category = :cat")
            params["cat"] = req.filter_category
        if req.filter_domain_prefix:
            where_clauses.append("control_id LIKE :prefix")
            params["prefix"] = f"{req.filter_domain_prefix}-%"

        where_sql = " AND ".join(where_clauses)
        rows = db.execute(
            text(f"""
                SELECT id, control_id, title, objective, category,
                       COALESCE(requirements::text, '[]') as requirements,
                       COALESCE(source_original_text, '') as source_text
                FROM canonical_controls
                WHERE {where_sql}
                ORDER BY created_at DESC
                LIMIT :limit
            """),
            params,
        ).fetchall()

        results = {"checked": 0, "mismatches": 0, "fixes": [], "errors": []}

        for row in rows:
            results["checked"] += 1
            control_id = row[1]
            title = row[2]
            objective = row[3] or ""
            current_category = row[4]
            source_text = row[6] or objective

            # Keyword detection on source text
            kw_category = _detect_category(source_text) or _detect_category(objective)
            kw_domain = _detect_domain(source_text)
            # Domain prefix is encoded in the control_id (text before the first "-").
            current_prefix = control_id.split("-")[0] if "-" in control_id else ""

            # Skip if keyword detection agrees with current classification
            if kw_category == current_category and kw_domain == current_prefix:
                continue

            results["mismatches"] += 1

            # Ask Ollama to arbitrate
            try:
                # Include up to 3 requirements as extra context for the LLM.
                reqs_text = ""
                try:
                    reqs = json.loads(row[5])
                    if isinstance(reqs, list):
                        reqs_text = ", ".join(str(r) for r in reqs[:3])
                except Exception:
                    pass

                prompt = f"""Pruefe dieses Compliance-Control auf korrekte Klassifizierung.

Titel: {title[:100]}
Ziel: {objective[:200]}
Anforderungen: {reqs_text[:200]}

Aktuelle Zuordnung: domain={current_prefix}, category={current_category}
Keyword-Erkennung: domain={kw_domain}, category={kw_category}

Welche Zuordnung ist korrekt? Antworte NUR als JSON:
{{"domain": "KUERZEL", "category": "kategorie_name", "reason": "kurze Begruendung"}}

Domains: AUTH=Authentifizierung, CRYP=Kryptographie, NET=Netzwerk, DATA=Datenschutz, LOG=Logging, ACC=Zugriffskontrolle, SEC=IT-Sicherheit, INC=Vorfallmanagement, AI=KI, COMP=Compliance, GOV=Behoerden, LAB=Arbeitsrecht, FIN=Finanzregulierung, TRD=Gewerbe, ENV=Umwelt, HLT=Gesundheit
Kategorien: {CATEGORY_LIST_STR}"""

                raw = await _llm_local(prompt)
                data = _parse_llm_json(raw)
                if not data:
                    # Unparseable LLM output — skip silently.
                    continue

                qa_domain = data.get("domain", "").upper()
                qa_category = data.get("category", "")
                reason = data.get("reason", "")

                # Invalid LLM suggestions fall back to the current values.
                fix_entry = {
                    "control_id": control_id,
                    "title": title[:80],
                    "old_category": current_category,
                    "old_domain": current_prefix,
                    "new_category": qa_category if qa_category in VALID_CATEGORIES else current_category,
                    "new_domain": qa_domain if qa_domain in VALID_DOMAINS else current_prefix,
                    "reason": reason,
                }

                category_changed = qa_category in VALID_CATEGORIES and qa_category != current_category

                if category_changed and not req.dry_run:
                    db.execute(
                        text("""
                            UPDATE canonical_controls
                            SET category = :category, updated_at = NOW()
                            WHERE id = :id
                        """),
                        {"id": row[0], "category": qa_category},
                    )
                    fix_entry["applied"] = True
                else:
                    fix_entry["applied"] = False

                results["fixes"].append(fix_entry)

            except Exception as e:
                results["errors"].append({"control_id": control_id, "error": str(e)})

        if not req.dry_run:
            db.commit()

        return results
    finally:
        db.close()
|
||||
|
||||
|
||||
@router.get("/generate/processed-stats")
|
||||
async def get_processed_stats():
|
||||
"""Get processing statistics per collection."""
|
||||
@@ -429,3 +694,407 @@ async def get_controls_customer_view(
|
||||
return {"controls": controls, "total": len(controls)}
|
||||
finally:
|
||||
db.close()
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# CITATION BACKFILL
|
||||
# =============================================================================
|
||||
|
||||
class BackfillRequest(BaseModel):
    """Payload for POST /generate/backfill-citations."""
    dry_run: bool = True  # Default to dry_run for safety
    limit: int = 0  # 0 = all controls


class BackfillResponse(BaseModel):
    """Ack/result shape for the citation-backfill endpoints."""
    status: str
    total_controls: int = 0
    matched_hash: int = 0
    matched_regex: int = 0
    matched_llm: int = 0
    unmatched: int = 0
    updated: int = 0
    errors: list = []


# In-memory status registry for backfill jobs, keyed by short job id.
# NOTE(review): process-local — lost on restart and not shared across
# workers; confirm single-worker deployment.
_backfill_status: dict = {}
|
||||
|
||||
|
||||
async def _run_backfill_background(dry_run: bool, limit: int, backfill_id: str):
    """Run backfill in background with own DB session.

    Writes the final result (or failure) into the process-local
    _backfill_status registry under *backfill_id*; never raises into
    the event loop.
    """
    db = SessionLocal()
    try:
        backfill = CitationBackfill(db=db, rag_client=get_rag_client())
        result = await backfill.run(dry_run=dry_run, limit=limit)
        _backfill_status[backfill_id] = {
            "status": "completed",
            "total_controls": result.total_controls,
            "matched_hash": result.matched_hash,
            "matched_regex": result.matched_regex,
            "matched_llm": result.matched_llm,
            "unmatched": result.unmatched,
            "updated": result.updated,
            "errors": result.errors[:50],  # cap stored errors
        }
        logger.info("Backfill %s completed: %d updated", backfill_id, result.updated)
    except Exception as e:
        logger.error("Backfill %s failed: %s", backfill_id, e)
        _backfill_status[backfill_id] = {"status": "failed", "errors": [str(e)]}
    finally:
        db.close()
|
||||
|
||||
|
||||
@router.post("/generate/backfill-citations", response_model=BackfillResponse)
async def start_backfill(req: BackfillRequest):
    """Backfill article/paragraph into existing control source_citations.

    Uses 3-tier matching: hash lookup → regex parse → Ollama LLM.
    Default is dry_run=True (preview only, no DB changes).

    Always runs asynchronously; the returned status string embeds the
    job id, e.g. "running (id=ab12cd34)" — poll
    /generate/backfill-status/{id} for progress.
    """
    import uuid
    # Short 8-char id is enough for this process-local registry.
    backfill_id = str(uuid.uuid4())[:8]
    _backfill_status[backfill_id] = {"status": "running"}

    # Always run in background (RAG index build takes minutes)
    asyncio.create_task(_run_backfill_background(req.dry_run, req.limit, backfill_id))
    return BackfillResponse(
        status=f"running (id={backfill_id})",
    )
|
||||
|
||||
|
||||
@router.get("/generate/backfill-status/{backfill_id}")
async def get_backfill_status(backfill_id: str):
    """Return the in-memory status record of a backfill job; 404 if unknown."""
    entry = _backfill_status.get(backfill_id)
    if entry:
        return entry
    raise HTTPException(status_code=404, detail="Backfill job not found")
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# DOMAIN + TARGET AUDIENCE BACKFILL
|
||||
# =============================================================================
|
||||
|
||||
class DomainBackfillRequest(BaseModel):
    """Payload for the domain/target-audience backfill job."""
    dry_run: bool = True
    job_id: Optional[str] = None  # Only backfill controls from this job
    limit: int = 0  # 0 = all


# In-memory status registry for domain-backfill jobs (process-local,
# same caveats as _backfill_status).
_domain_backfill_status: dict = {}
|
||||
|
||||
|
||||
async def _run_domain_backfill(req: DomainBackfillRequest, backfill_id: str):
|
||||
"""Backfill domain, category, and target_audience for existing controls using Anthropic."""
|
||||
import os
|
||||
import httpx
|
||||
|
||||
ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY", "")
|
||||
ANTHROPIC_MODEL = os.getenv("CONTROL_GEN_ANTHROPIC_MODEL", "claude-sonnet-4-6")
|
||||
|
||||
if not ANTHROPIC_API_KEY:
|
||||
_domain_backfill_status[backfill_id] = {
|
||||
"status": "failed", "error": "ANTHROPIC_API_KEY not set"
|
||||
}
|
||||
return
|
||||
|
||||
db = SessionLocal()
|
||||
try:
|
||||
# Find controls needing backfill
|
||||
where_clauses = ["(target_audience IS NULL OR target_audience = '[]' OR target_audience = 'null')"]
|
||||
params: dict = {}
|
||||
if req.job_id:
|
||||
where_clauses.append("generation_metadata->>'job_id' = :job_id")
|
||||
params["job_id"] = req.job_id
|
||||
|
||||
query = f"""
|
||||
SELECT id, control_id, title, objective, category, source_original_text, tags
|
||||
FROM canonical_controls
|
||||
WHERE {' AND '.join(where_clauses)}
|
||||
ORDER BY control_id
|
||||
"""
|
||||
if req.limit > 0:
|
||||
query += f" LIMIT {req.limit}"
|
||||
|
||||
result = db.execute(text(query), params)
|
||||
controls = [dict(zip(result.keys(), row)) for row in result]
|
||||
|
||||
total = len(controls)
|
||||
updated = 0
|
||||
errors = []
|
||||
|
||||
_domain_backfill_status[backfill_id] = {
|
||||
"status": "running", "total": total, "updated": 0, "errors": []
|
||||
}
|
||||
|
||||
# Process in batches of 10
|
||||
BATCH_SIZE = 10
|
||||
for batch_start in range(0, total, BATCH_SIZE):
|
||||
batch = controls[batch_start:batch_start + BATCH_SIZE]
|
||||
|
||||
entries = []
|
||||
for idx, ctrl in enumerate(batch):
|
||||
text_for_analysis = ctrl.get("objective") or ctrl.get("title") or ""
|
||||
original = ctrl.get("source_original_text") or ""
|
||||
if original:
|
||||
text_for_analysis += f"\n\nQuelltext-Auszug: {original[:500]}"
|
||||
entries.append(
|
||||
f"--- CONTROL {idx + 1}: {ctrl['control_id']} ---\n"
|
||||
f"Titel: {ctrl.get('title', '')}\n"
|
||||
f"Objective: {text_for_analysis[:800]}\n"
|
||||
f"Tags: {json.dumps(ctrl.get('tags', []))}"
|
||||
)
|
||||
|
||||
prompt = f"""Analysiere die folgenden {len(batch)} Controls und bestimme fuer jedes:
|
||||
1. domain: Das Fachgebiet (AUTH, CRYP, NET, DATA, LOG, ACC, SEC, INC, AI, COMP, GOV, LAB, FIN, TRD, ENV, HLT)
|
||||
2. category: Die Kategorie (encryption, authentication, network, data_protection, logging, incident, continuity, compliance, supply_chain, physical, personnel, application, system, risk, governance, hardware, identity, public_administration, labor_law, finance, trade_regulation, environmental, health)
|
||||
3. target_audience: Liste der Zielgruppen (moegliche Werte: "unternehmen", "behoerden", "entwickler", "datenschutzbeauftragte", "geschaeftsfuehrung", "it-abteilung", "rechtsabteilung", "compliance-officer", "personalwesen", "einkauf", "produktion", "vertrieb", "gesundheitswesen", "finanzwesen", "oeffentlicher_dienst")
|
||||
|
||||
Antworte mit einem JSON-Array mit {len(batch)} Objekten. Jedes Objekt hat:
|
||||
- control_index: 1-basierter Index
|
||||
- domain: Fachgebiet-Kuerzel
|
||||
- category: Kategorie
|
||||
- target_audience: Liste der Zielgruppen
|
||||
|
||||
{"".join(entries)}"""
|
||||
|
||||
try:
|
||||
headers = {
|
||||
"x-api-key": ANTHROPIC_API_KEY,
|
||||
"anthropic-version": "2023-06-01",
|
||||
"content-type": "application/json",
|
||||
}
|
||||
payload = {
|
||||
"model": ANTHROPIC_MODEL,
|
||||
"max_tokens": 4096,
|
||||
"system": "Du bist ein Compliance-Experte. Klassifiziere Controls nach Fachgebiet und Zielgruppe. Antworte NUR mit validem JSON.",
|
||||
"messages": [{"role": "user", "content": prompt}],
|
||||
}
|
||||
|
||||
async with httpx.AsyncClient(timeout=60.0) as client:
|
||||
resp = await client.post(
|
||||
"https://api.anthropic.com/v1/messages",
|
||||
headers=headers,
|
||||
json=payload,
|
||||
)
|
||||
if resp.status_code != 200:
|
||||
errors.append(f"Anthropic API {resp.status_code} at batch {batch_start}")
|
||||
continue
|
||||
|
||||
raw = resp.json().get("content", [{}])[0].get("text", "")
|
||||
|
||||
# Parse response
|
||||
import re
|
||||
bracket_match = re.search(r"\[.*\]", raw, re.DOTALL)
|
||||
if not bracket_match:
|
||||
errors.append(f"No JSON array in response at batch {batch_start}")
|
||||
continue
|
||||
|
||||
results_list = json.loads(bracket_match.group(0))
|
||||
|
||||
for item in results_list:
|
||||
idx = item.get("control_index", 0) - 1
|
||||
if idx < 0 or idx >= len(batch):
|
||||
continue
|
||||
ctrl = batch[idx]
|
||||
ctrl_id = str(ctrl["id"])
|
||||
|
||||
new_domain = item.get("domain", "")
|
||||
new_category = item.get("category", "")
|
||||
new_audience = item.get("target_audience", [])
|
||||
|
||||
if not isinstance(new_audience, list):
|
||||
new_audience = []
|
||||
|
||||
# Build new control_id from domain if domain changed
|
||||
old_prefix = ctrl["control_id"].split("-")[0] if ctrl["control_id"] else ""
|
||||
new_prefix = new_domain.upper()[:4] if new_domain else old_prefix
|
||||
|
||||
if not req.dry_run:
|
||||
update_parts = []
|
||||
update_params: dict = {"ctrl_id": ctrl_id}
|
||||
|
||||
if new_category:
|
||||
update_parts.append("category = :category")
|
||||
update_params["category"] = new_category
|
||||
|
||||
if new_audience:
|
||||
update_parts.append("target_audience = :target_audience")
|
||||
update_params["target_audience"] = json.dumps(new_audience)
|
||||
|
||||
# Note: We do NOT rename control_ids here — that would
|
||||
# break references and cause unique constraint violations.
|
||||
|
||||
if update_parts:
|
||||
update_parts.append("updated_at = NOW()")
|
||||
db.execute(
|
||||
text(f"UPDATE canonical_controls SET {', '.join(update_parts)} WHERE id = CAST(:ctrl_id AS uuid)"),
|
||||
update_params,
|
||||
)
|
||||
updated += 1
|
||||
|
||||
if not req.dry_run:
|
||||
db.commit()
|
||||
|
||||
except Exception as e:
|
||||
errors.append(f"Batch {batch_start}: {str(e)}")
|
||||
db.rollback()
|
||||
|
||||
_domain_backfill_status[backfill_id] = {
|
||||
"status": "running", "total": total, "updated": updated,
|
||||
"progress": f"{min(batch_start + BATCH_SIZE, total)}/{total}",
|
||||
"errors": errors[-10:],
|
||||
}
|
||||
|
||||
_domain_backfill_status[backfill_id] = {
|
||||
"status": "completed", "total": total, "updated": updated,
|
||||
"errors": errors[-50:],
|
||||
}
|
||||
logger.info("Domain backfill %s completed: %d/%d updated", backfill_id, updated, total)
|
||||
|
||||
except Exception as e:
|
||||
logger.error("Domain backfill %s failed: %s", backfill_id, e)
|
||||
_domain_backfill_status[backfill_id] = {"status": "failed", "error": str(e)}
|
||||
finally:
|
||||
db.close()
|
||||
|
||||
|
||||
@router.post("/generate/backfill-domain")
|
||||
async def start_domain_backfill(req: DomainBackfillRequest):
|
||||
"""Backfill domain, category, and target_audience for controls using Anthropic API.
|
||||
|
||||
Finds controls where target_audience is NULL and enriches them.
|
||||
Default is dry_run=True (preview only).
|
||||
"""
|
||||
import uuid
|
||||
backfill_id = str(uuid.uuid4())[:8]
|
||||
_domain_backfill_status[backfill_id] = {"status": "starting"}
|
||||
asyncio.create_task(_run_domain_backfill(req, backfill_id))
|
||||
return {"status": "running", "backfill_id": backfill_id,
|
||||
"message": f"Domain backfill started. Poll /generate/backfill-status/{backfill_id}"}
|
||||
|
||||
|
||||
@router.get("/generate/domain-backfill-status/{backfill_id}")
|
||||
async def get_domain_backfill_status(backfill_id: str):
|
||||
"""Get status of a domain backfill job."""
|
||||
status = _domain_backfill_status.get(backfill_id)
|
||||
if not status:
|
||||
raise HTTPException(status_code=404, detail="Domain backfill job not found")
|
||||
return status
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
# Source-Type Backfill — Classify law vs guideline vs standard vs restricted
# ---------------------------------------------------------------------------


class SourceTypeBackfillRequest(BaseModel):
    """Request body for the source-type backfill endpoint."""

    # When True (default), classify and count but write nothing to the DB.
    dry_run: bool = True


# In-memory job registry: backfill_id -> progress/result dict.
# Lost on process restart; suitable only for ad-hoc admin jobs.
_source_type_backfill_status: dict = {}
|
||||
|
||||
|
||||
async def _run_source_type_backfill(dry_run: bool, backfill_id: str):
    """Backfill source_type into source_citation JSONB for all controls.

    Runs as a fire-and-forget background task; progress and the final result
    are published into the module-level ``_source_type_backfill_status`` dict
    under *backfill_id*.
    """
    db = SessionLocal()
    try:
        # Find controls with source_citation that lack source_type
        rows = db.execute(text("""
            SELECT control_id, source_citation, generation_metadata
            FROM compliance.canonical_controls
            WHERE source_citation IS NOT NULL
              AND (source_citation->>'source_type' IS NULL
                   OR source_citation->>'source_type' = '')
        """)).fetchall()

        total = len(rows)
        updated = 0
        already_correct = 0  # NOTE(review): never incremented — currently unused
        errors = []

        _source_type_backfill_status[backfill_id] = {
            "status": "running", "total": total, "updated": 0, "dry_run": dry_run,
        }

        for row in rows:
            cid = row[0]
            # JSONB columns may arrive as dicts or as JSON strings depending
            # on the driver; normalise both shapes to plain dicts.
            citation = row[1] if isinstance(row[1], dict) else json.loads(row[1] or "{}")
            metadata = row[2] if isinstance(row[2], dict) else json.loads(row[2] or "{}")

            # Get regulation_code from metadata
            reg_code = metadata.get("source_regulation", "")
            if not reg_code:
                # Try to infer from source name
                errors.append(f"{cid}: no source_regulation in metadata")
                continue

            # Classify
            license_info = _classify_regulation(reg_code)
            source_type = license_info.get("source_type", "restricted")

            # Update citation
            citation["source_type"] = source_type

            if not dry_run:
                db.execute(text("""
                    UPDATE compliance.canonical_controls
                    SET source_citation = :citation
                    WHERE control_id = :cid
                """), {"citation": json.dumps(citation), "cid": cid})
                # Periodic commit keeps transactions small on large backfills.
                # (Checked before the increment, so the very first write also
                # triggers a commit — harmless.)
                if updated % 100 == 0:
                    db.commit()
            # Incremented in dry_run too: "updated" means "would be updated".
            updated += 1

        if not dry_run:
            db.commit()

        # Count distribution (skipped in dry_run — nothing was written)
        dist_query = db.execute(text("""
            SELECT source_citation->>'source_type' as st, COUNT(*)
            FROM compliance.canonical_controls
            WHERE source_citation IS NOT NULL
              AND source_citation->>'source_type' IS NOT NULL
            GROUP BY st
        """)).fetchall() if not dry_run else []

        distribution = {r[0]: r[1] for r in dist_query}

        _source_type_backfill_status[backfill_id] = {
            "status": "completed", "total": total, "updated": updated,
            "dry_run": dry_run, "distribution": distribution,
            "errors": errors[:50],
        }
        logger.info("Source-type backfill %s completed: %d/%d updated (dry_run=%s)",
                    backfill_id, updated, total, dry_run)

    except Exception as e:
        logger.error("Source-type backfill %s failed: %s", backfill_id, e)
        _source_type_backfill_status[backfill_id] = {"status": "failed", "error": str(e)}
    finally:
        db.close()
|
||||
|
||||
|
||||
@router.post("/generate/backfill-source-type")
|
||||
async def start_source_type_backfill(req: SourceTypeBackfillRequest):
|
||||
"""Backfill source_type (law/guideline/standard/restricted) into source_citation JSONB.
|
||||
|
||||
Classifies each control's source as binding law, authority guideline,
|
||||
voluntary standard, or restricted norm based on regulation_code.
|
||||
Default is dry_run=True (preview only).
|
||||
"""
|
||||
import uuid
|
||||
backfill_id = str(uuid.uuid4())[:8]
|
||||
_source_type_backfill_status[backfill_id] = {"status": "starting"}
|
||||
asyncio.create_task(_run_source_type_backfill(req.dry_run, backfill_id))
|
||||
return {
|
||||
"status": "running",
|
||||
"backfill_id": backfill_id,
|
||||
"message": f"Source-type backfill started. Poll /generate/source-type-backfill-status/{backfill_id}",
|
||||
}
|
||||
|
||||
|
||||
@router.get("/generate/source-type-backfill-status/{backfill_id}")
|
||||
async def get_source_type_backfill_status(backfill_id: str):
|
||||
"""Get status of a source-type backfill job."""
|
||||
status = _source_type_backfill_status.get(backfill_id)
|
||||
if not status:
|
||||
raise HTTPException(status_code=404, detail="Source-type backfill job not found")
|
||||
return status
|
||||
|
||||
856
backend-compliance/compliance/api/crosswalk_routes.py
Normal file
856
backend-compliance/compliance/api/crosswalk_routes.py
Normal file
@@ -0,0 +1,856 @@
|
||||
"""
|
||||
FastAPI routes for the Multi-Layer Control Architecture.
|
||||
|
||||
Pattern Library, Obligation Extraction, Crosswalk Matrix, and Migration endpoints.
|
||||
|
||||
Endpoints:
|
||||
GET /v1/canonical/patterns — All patterns (with filters)
|
||||
GET /v1/canonical/patterns/{pattern_id} — Single pattern
|
||||
GET /v1/canonical/patterns/{pattern_id}/controls — Controls for a pattern
|
||||
|
||||
POST /v1/canonical/obligations/extract — Extract obligations from text
|
||||
GET /v1/canonical/crosswalk — Query crosswalk matrix
|
||||
GET /v1/canonical/crosswalk/stats — Coverage statistics
|
||||
|
||||
POST /v1/canonical/migrate/decompose — Pass 0a: Obligation extraction
|
||||
POST /v1/canonical/migrate/merge-obligations — Merge implementation-level dupes
|
||||
POST /v1/canonical/migrate/enrich-obligations — Add trigger_type, impl metadata
|
||||
POST /v1/canonical/migrate/compose-atomic — Pass 0b: Atomic control composition
|
||||
POST /v1/canonical/migrate/link-obligations — Pass 1: Obligation linkage
|
||||
POST /v1/canonical/migrate/classify-patterns — Pass 2: Pattern classification
|
||||
POST /v1/canonical/migrate/triage — Pass 3: Quality triage
|
||||
POST /v1/canonical/migrate/backfill-crosswalk — Pass 4: Crosswalk backfill
|
||||
POST /v1/canonical/migrate/deduplicate — Pass 5: Deduplication
|
||||
GET /v1/canonical/migrate/status — Migration progress
|
||||
GET /v1/canonical/migrate/decomposition-status — Decomposition progress
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
from typing import Optional, List
|
||||
|
||||
from fastapi import APIRouter, HTTPException, Query
|
||||
from pydantic import BaseModel
|
||||
from sqlalchemy import text
|
||||
|
||||
from database import SessionLocal
|
||||
|
||||
# Module-level logger and the APIRouter every endpoint below registers on.
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/v1/canonical", tags=["crosswalk"])
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# REQUEST / RESPONSE MODELS
|
||||
# =============================================================================
|
||||
|
||||
|
||||
class PatternResponse(BaseModel):
    """Summary view of a control pattern, as returned by the list endpoint."""

    id: str
    name: str
    name_de: str  # German display name
    domain: str
    category: str
    description: str
    objective_template: str
    severity_default: str
    implementation_effort_default: str = "m"
    tags: list = []
    composable_with: list = []
    open_anchor_refs: list = []
    # Number of canonical controls generated from this pattern (from DB).
    controls_count: int = 0
|
||||
|
||||
|
||||
class PatternListResponse(BaseModel):
    """Envelope for GET /patterns: the filtered patterns plus their count."""

    patterns: List[PatternResponse]
    total: int
|
||||
|
||||
|
||||
class PatternDetailResponse(PatternResponse):
    """Full pattern view: summary fields plus all templates and match keywords."""

    rationale_template: str = ""
    requirements_template: list = []
    test_procedure_template: list = []
    evidence_template: list = []
    # Keywords used by the tier-1 keyword matcher to link obligations.
    obligation_match_keywords: list = []
|
||||
|
||||
|
||||
class ObligationExtractRequest(BaseModel):
    """Input for POST /obligations/extract: raw text plus optional citation."""

    text: str
    regulation_code: Optional[str] = None
    article: Optional[str] = None
    paragraph: Optional[str] = None
|
||||
|
||||
|
||||
class ObligationExtractResponse(BaseModel):
    """Result of obligation extraction plus the best pattern match, if any."""

    obligation_id: Optional[str] = None
    obligation_title: Optional[str] = None
    obligation_text: Optional[str] = None
    # Which extraction tier produced the result; "none" if nothing matched.
    method: str = "none"
    confidence: float = 0.0
    regulation_id: Optional[str] = None
    pattern_id: Optional[str] = None
    pattern_confidence: float = 0.0
|
||||
|
||||
|
||||
class CrosswalkRow(BaseModel):
    """One regulation→obligation→pattern→control mapping in the crosswalk."""

    regulation_code: str = ""
    article: Optional[str] = None
    obligation_id: Optional[str] = None
    pattern_id: Optional[str] = None
    master_control_id: Optional[str] = None
    confidence: float = 0.0
    # "auto" when machine-generated; other values indicate manual curation.
    source: str = "auto"
|
||||
|
||||
|
||||
class CrosswalkQueryResponse(BaseModel):
    """Envelope for GET /crosswalk: one page of rows plus the unpaged total."""

    rows: List[CrosswalkRow]
    total: int
|
||||
|
||||
|
||||
class CrosswalkStatsResponse(BaseModel):
    """Aggregate coverage statistics over the crosswalk matrix."""

    total_rows: int = 0
    regulations_covered: int = 0
    obligations_linked: int = 0
    patterns_used: int = 0
    controls_linked: int = 0
    # regulation_code -> row count, ordered by count in the query.
    coverage_by_regulation: dict = {}
|
||||
|
||||
|
||||
class MigrationRequest(BaseModel):
    """Common knobs for the synchronous migration-pass endpoints."""

    limit: int = 0  # 0 = no limit
    batch_size: int = 0  # 0 = auto (5 for Anthropic, 1 for Ollama)
    use_anthropic: bool = False  # Use Anthropic API instead of Ollama
    category_filter: Optional[str] = None  # Comma-separated categories
    source_filter: Optional[str] = None  # Comma-separated source regulations (ILIKE match)
|
||||
|
||||
|
||||
class BatchSubmitRequest(BaseModel):
    """Knobs for submitting a pass as an Anthropic Batch API job."""

    limit: int = 0  # 0 = no limit
    batch_size: int = 5
    category_filter: Optional[str] = None
    source_filter: Optional[str] = None
|
||||
|
||||
|
||||
class BatchProcessRequest(BaseModel):
    """Identifies a completed Anthropic batch whose results should be ingested."""

    batch_id: str
    pass_type: str = "0a"  # "0a" or "0b"
|
||||
|
||||
|
||||
class MigrationResponse(BaseModel):
    """Generic envelope for migration endpoints: outcome plus pass statistics."""

    status: str = "completed"
    stats: dict = {}
|
||||
|
||||
|
||||
class MigrationStatusResponse(BaseModel):
    """Linkage coverage of canonical controls across the migration passes."""

    total_controls: int = 0
    has_obligation: int = 0
    has_pattern: int = 0
    # Controls with both an obligation and a pattern linked.
    fully_linked: int = 0
    deprecated: int = 0
    coverage_obligation_pct: float = 0.0
    coverage_pattern_pct: float = 0.0
    coverage_full_pct: float = 0.0
|
||||
|
||||
|
||||
class DecompositionStatusResponse(BaseModel):
    """Progress counters for the Pass 0a/0b decomposition pipeline."""

    rich_controls: int = 0
    decomposed_controls: int = 0
    total_candidates: int = 0
    validated: int = 0
    rejected: int = 0
    composed: int = 0
    atomic_controls: int = 0
    merged: int = 0
    enriched: int = 0
    ready_for_pass0b: int = 0
    decomposition_pct: float = 0.0
    composition_pct: float = 0.0
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# PATTERN LIBRARY ENDPOINTS
|
||||
# =============================================================================
|
||||
|
||||
|
||||
@router.get("/patterns", response_model=PatternListResponse)
|
||||
async def list_patterns(
|
||||
domain: Optional[str] = Query(None, description="Filter by domain (e.g. AUTH, CRYP)"),
|
||||
category: Optional[str] = Query(None, description="Filter by category"),
|
||||
tag: Optional[str] = Query(None, description="Filter by tag"),
|
||||
):
|
||||
"""List all control patterns with optional filters."""
|
||||
from compliance.services.pattern_matcher import PatternMatcher
|
||||
|
||||
matcher = PatternMatcher()
|
||||
matcher._load_patterns()
|
||||
matcher._build_keyword_index()
|
||||
|
||||
patterns = matcher._patterns
|
||||
|
||||
if domain:
|
||||
patterns = [p for p in patterns if p.domain == domain.upper()]
|
||||
if category:
|
||||
patterns = [p for p in patterns if p.category == category.lower()]
|
||||
if tag:
|
||||
patterns = [p for p in patterns if tag.lower() in [t.lower() for t in p.tags]]
|
||||
|
||||
# Count controls per pattern from DB
|
||||
control_counts = _get_pattern_control_counts()
|
||||
|
||||
response_patterns = []
|
||||
for p in patterns:
|
||||
response_patterns.append(PatternResponse(
|
||||
id=p.id,
|
||||
name=p.name,
|
||||
name_de=p.name_de,
|
||||
domain=p.domain,
|
||||
category=p.category,
|
||||
description=p.description,
|
||||
objective_template=p.objective_template,
|
||||
severity_default=p.severity_default,
|
||||
implementation_effort_default=p.implementation_effort_default,
|
||||
tags=p.tags,
|
||||
composable_with=p.composable_with,
|
||||
open_anchor_refs=p.open_anchor_refs,
|
||||
controls_count=control_counts.get(p.id, 0),
|
||||
))
|
||||
|
||||
return PatternListResponse(patterns=response_patterns, total=len(response_patterns))
|
||||
|
||||
|
||||
@router.get("/patterns/{pattern_id}", response_model=PatternDetailResponse)
|
||||
async def get_pattern(pattern_id: str):
|
||||
"""Get a single control pattern by ID."""
|
||||
from compliance.services.pattern_matcher import PatternMatcher
|
||||
|
||||
matcher = PatternMatcher()
|
||||
matcher._load_patterns()
|
||||
|
||||
pattern = matcher.get_pattern(pattern_id)
|
||||
if not pattern:
|
||||
raise HTTPException(status_code=404, detail=f"Pattern {pattern_id} not found")
|
||||
|
||||
control_counts = _get_pattern_control_counts()
|
||||
|
||||
return PatternDetailResponse(
|
||||
id=pattern.id,
|
||||
name=pattern.name,
|
||||
name_de=pattern.name_de,
|
||||
domain=pattern.domain,
|
||||
category=pattern.category,
|
||||
description=pattern.description,
|
||||
objective_template=pattern.objective_template,
|
||||
rationale_template=pattern.rationale_template,
|
||||
requirements_template=pattern.requirements_template,
|
||||
test_procedure_template=pattern.test_procedure_template,
|
||||
evidence_template=pattern.evidence_template,
|
||||
severity_default=pattern.severity_default,
|
||||
implementation_effort_default=pattern.implementation_effort_default,
|
||||
tags=pattern.tags,
|
||||
composable_with=pattern.composable_with,
|
||||
open_anchor_refs=pattern.open_anchor_refs,
|
||||
obligation_match_keywords=pattern.obligation_match_keywords,
|
||||
controls_count=control_counts.get(pattern.id, 0),
|
||||
)
|
||||
|
||||
|
||||
@router.get("/patterns/{pattern_id}/controls")
|
||||
async def get_pattern_controls(
|
||||
pattern_id: str,
|
||||
limit: int = Query(50, ge=1, le=500),
|
||||
offset: int = Query(0, ge=0),
|
||||
):
|
||||
"""Get controls generated from a specific pattern."""
|
||||
db = SessionLocal()
|
||||
try:
|
||||
result = db.execute(
|
||||
text("""
|
||||
SELECT id, control_id, title, objective, severity,
|
||||
release_state, category, obligation_ids
|
||||
FROM canonical_controls
|
||||
WHERE pattern_id = :pattern_id
|
||||
AND release_state NOT IN ('deprecated')
|
||||
ORDER BY control_id
|
||||
LIMIT :limit OFFSET :offset
|
||||
"""),
|
||||
{"pattern_id": pattern_id.upper(), "limit": limit, "offset": offset},
|
||||
)
|
||||
rows = result.fetchall()
|
||||
|
||||
count_result = db.execute(
|
||||
text("""
|
||||
SELECT count(*) FROM canonical_controls
|
||||
WHERE pattern_id = :pattern_id
|
||||
AND release_state NOT IN ('deprecated')
|
||||
"""),
|
||||
{"pattern_id": pattern_id.upper()},
|
||||
)
|
||||
total = count_result.fetchone()[0]
|
||||
|
||||
controls = []
|
||||
for row in rows:
|
||||
obl_ids = row[7]
|
||||
if isinstance(obl_ids, str):
|
||||
try:
|
||||
obl_ids = json.loads(obl_ids)
|
||||
except (json.JSONDecodeError, TypeError):
|
||||
obl_ids = []
|
||||
controls.append({
|
||||
"id": str(row[0]),
|
||||
"control_id": row[1],
|
||||
"title": row[2],
|
||||
"objective": row[3],
|
||||
"severity": row[4],
|
||||
"release_state": row[5],
|
||||
"category": row[6],
|
||||
"obligation_ids": obl_ids or [],
|
||||
})
|
||||
|
||||
return {"controls": controls, "total": total}
|
||||
finally:
|
||||
db.close()
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# OBLIGATION EXTRACTION ENDPOINT
|
||||
# =============================================================================
|
||||
|
||||
|
||||
@router.post("/obligations/extract", response_model=ObligationExtractResponse)
|
||||
async def extract_obligation(req: ObligationExtractRequest):
|
||||
"""Extract obligation from text using 3-tier strategy, then match to pattern."""
|
||||
from compliance.services.obligation_extractor import ObligationExtractor
|
||||
from compliance.services.pattern_matcher import PatternMatcher
|
||||
|
||||
extractor = ObligationExtractor()
|
||||
await extractor.initialize()
|
||||
|
||||
obligation = await extractor.extract(
|
||||
chunk_text=req.text,
|
||||
regulation_code=req.regulation_code or "",
|
||||
article=req.article,
|
||||
paragraph=req.paragraph,
|
||||
)
|
||||
|
||||
# Also match to pattern
|
||||
matcher = PatternMatcher()
|
||||
matcher._load_patterns()
|
||||
matcher._build_keyword_index()
|
||||
|
||||
pattern_text = obligation.obligation_text or obligation.obligation_title or req.text[:500]
|
||||
pattern_result = matcher._tier1_keyword(pattern_text, obligation.regulation_id)
|
||||
|
||||
return ObligationExtractResponse(
|
||||
obligation_id=obligation.obligation_id,
|
||||
obligation_title=obligation.obligation_title,
|
||||
obligation_text=obligation.obligation_text,
|
||||
method=obligation.method,
|
||||
confidence=obligation.confidence,
|
||||
regulation_id=obligation.regulation_id,
|
||||
pattern_id=pattern_result.pattern_id if pattern_result else None,
|
||||
pattern_confidence=pattern_result.confidence if pattern_result else 0,
|
||||
)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# CROSSWALK MATRIX ENDPOINTS
|
||||
# =============================================================================
|
||||
|
||||
|
||||
@router.get("/crosswalk", response_model=CrosswalkQueryResponse)
|
||||
async def query_crosswalk(
|
||||
regulation_code: Optional[str] = Query(None),
|
||||
article: Optional[str] = Query(None),
|
||||
obligation_id: Optional[str] = Query(None),
|
||||
pattern_id: Optional[str] = Query(None),
|
||||
limit: int = Query(100, ge=1, le=1000),
|
||||
offset: int = Query(0, ge=0),
|
||||
):
|
||||
"""Query the crosswalk matrix with filters."""
|
||||
db = SessionLocal()
|
||||
try:
|
||||
conditions = ["1=1"]
|
||||
params = {"limit": limit, "offset": offset}
|
||||
|
||||
if regulation_code:
|
||||
conditions.append("regulation_code = :reg")
|
||||
params["reg"] = regulation_code
|
||||
if article:
|
||||
conditions.append("article = :art")
|
||||
params["art"] = article
|
||||
if obligation_id:
|
||||
conditions.append("obligation_id = :obl")
|
||||
params["obl"] = obligation_id
|
||||
if pattern_id:
|
||||
conditions.append("pattern_id = :pat")
|
||||
params["pat"] = pattern_id
|
||||
|
||||
where = " AND ".join(conditions)
|
||||
|
||||
result = db.execute(
|
||||
text(f"""
|
||||
SELECT regulation_code, article, obligation_id,
|
||||
pattern_id, master_control_id, confidence, source
|
||||
FROM crosswalk_matrix
|
||||
WHERE {where}
|
||||
ORDER BY regulation_code, article
|
||||
LIMIT :limit OFFSET :offset
|
||||
"""),
|
||||
params,
|
||||
)
|
||||
rows = result.fetchall()
|
||||
|
||||
count_result = db.execute(
|
||||
text(f"SELECT count(*) FROM crosswalk_matrix WHERE {where}"),
|
||||
params,
|
||||
)
|
||||
total = count_result.fetchone()[0]
|
||||
|
||||
crosswalk_rows = [
|
||||
CrosswalkRow(
|
||||
regulation_code=r[0] or "",
|
||||
article=r[1],
|
||||
obligation_id=r[2],
|
||||
pattern_id=r[3],
|
||||
master_control_id=r[4],
|
||||
confidence=float(r[5] or 0),
|
||||
source=r[6] or "auto",
|
||||
)
|
||||
for r in rows
|
||||
]
|
||||
|
||||
return CrosswalkQueryResponse(rows=crosswalk_rows, total=total)
|
||||
finally:
|
||||
db.close()
|
||||
|
||||
|
||||
@router.get("/crosswalk/stats", response_model=CrosswalkStatsResponse)
|
||||
async def crosswalk_stats():
|
||||
"""Get crosswalk coverage statistics."""
|
||||
db = SessionLocal()
|
||||
try:
|
||||
row = db.execute(text("""
|
||||
SELECT
|
||||
count(*) AS total,
|
||||
count(DISTINCT regulation_code) FILTER (WHERE regulation_code != '') AS regs,
|
||||
count(DISTINCT obligation_id) FILTER (WHERE obligation_id IS NOT NULL) AS obls,
|
||||
count(DISTINCT pattern_id) FILTER (WHERE pattern_id IS NOT NULL) AS pats,
|
||||
count(DISTINCT master_control_id) FILTER (WHERE master_control_id IS NOT NULL) AS ctrls
|
||||
FROM crosswalk_matrix
|
||||
""")).fetchone()
|
||||
|
||||
# Coverage by regulation
|
||||
reg_rows = db.execute(text("""
|
||||
SELECT regulation_code, count(*) AS cnt
|
||||
FROM crosswalk_matrix
|
||||
WHERE regulation_code != ''
|
||||
GROUP BY regulation_code
|
||||
ORDER BY cnt DESC
|
||||
""")).fetchall()
|
||||
|
||||
coverage = {r[0]: r[1] for r in reg_rows}
|
||||
|
||||
return CrosswalkStatsResponse(
|
||||
total_rows=row[0],
|
||||
regulations_covered=row[1],
|
||||
obligations_linked=row[2],
|
||||
patterns_used=row[3],
|
||||
controls_linked=row[4],
|
||||
coverage_by_regulation=coverage,
|
||||
)
|
||||
finally:
|
||||
db.close()
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# MIGRATION ENDPOINTS
|
||||
# =============================================================================
|
||||
|
||||
|
||||
@router.post("/migrate/decompose", response_model=MigrationResponse)
|
||||
async def migrate_decompose(req: MigrationRequest):
|
||||
"""Pass 0a: Extract obligation candidates from rich controls.
|
||||
|
||||
With use_anthropic=true, uses Anthropic API with prompt caching
|
||||
and content batching (multiple controls per API call).
|
||||
"""
|
||||
from compliance.services.decomposition_pass import DecompositionPass
|
||||
|
||||
db = SessionLocal()
|
||||
try:
|
||||
decomp = DecompositionPass(db=db)
|
||||
stats = await decomp.run_pass0a(
|
||||
limit=req.limit,
|
||||
batch_size=req.batch_size,
|
||||
use_anthropic=req.use_anthropic,
|
||||
category_filter=req.category_filter,
|
||||
source_filter=req.source_filter,
|
||||
)
|
||||
return MigrationResponse(status="completed", stats=stats)
|
||||
except Exception as e:
|
||||
logger.error("Decomposition pass 0a failed: %s", e)
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
finally:
|
||||
db.close()
|
||||
|
||||
|
||||
@router.post("/migrate/merge-obligations", response_model=MigrationResponse)
|
||||
async def migrate_merge_obligations():
|
||||
"""Merge implementation-level duplicate obligations within each parent.
|
||||
|
||||
Run AFTER Pass 0a, BEFORE Pass 0b. No LLM calls — rule-based.
|
||||
Merges obligations that share similar action+object into the more
|
||||
abstract survivor, marking the concrete duplicate as 'merged'.
|
||||
"""
|
||||
from compliance.services.decomposition_pass import DecompositionPass
|
||||
|
||||
db = SessionLocal()
|
||||
try:
|
||||
decomp = DecompositionPass(db=db)
|
||||
stats = decomp.run_merge_pass()
|
||||
return MigrationResponse(status="completed", stats=stats)
|
||||
except Exception as e:
|
||||
logger.error("Merge pass failed: %s", e)
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
finally:
|
||||
db.close()
|
||||
|
||||
|
||||
@router.post("/migrate/enrich-obligations", response_model=MigrationResponse)
|
||||
async def migrate_enrich_obligations():
|
||||
"""Add trigger_type and is_implementation_specific metadata.
|
||||
|
||||
Run AFTER merge pass, BEFORE Pass 0b. No LLM calls — rule-based.
|
||||
Classifies trigger_type (event/periodic/continuous) from obligation text
|
||||
and detects implementation-specific obligations (concrete tools/protocols).
|
||||
"""
|
||||
from compliance.services.decomposition_pass import DecompositionPass
|
||||
|
||||
db = SessionLocal()
|
||||
try:
|
||||
decomp = DecompositionPass(db=db)
|
||||
stats = decomp.enrich_obligations()
|
||||
return MigrationResponse(status="completed", stats=stats)
|
||||
except Exception as e:
|
||||
logger.error("Enrich pass failed: %s", e)
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
finally:
|
||||
db.close()
|
||||
|
||||
|
||||
@router.post("/migrate/compose-atomic", response_model=MigrationResponse)
|
||||
async def migrate_compose_atomic(req: MigrationRequest):
|
||||
"""Pass 0b: Compose atomic controls from obligation candidates.
|
||||
|
||||
With use_anthropic=true, uses Anthropic API with prompt caching
|
||||
and content batching (multiple obligations per API call).
|
||||
"""
|
||||
from compliance.services.decomposition_pass import DecompositionPass
|
||||
|
||||
db = SessionLocal()
|
||||
try:
|
||||
decomp = DecompositionPass(db=db)
|
||||
stats = await decomp.run_pass0b(
|
||||
limit=req.limit,
|
||||
batch_size=req.batch_size,
|
||||
use_anthropic=req.use_anthropic,
|
||||
)
|
||||
return MigrationResponse(status="completed", stats=stats)
|
||||
except Exception as e:
|
||||
logger.error("Decomposition pass 0b failed: %s", e)
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
finally:
|
||||
db.close()
|
||||
|
||||
|
||||
@router.post("/migrate/batch-submit-0a", response_model=MigrationResponse)
|
||||
async def batch_submit_pass0a(req: BatchSubmitRequest):
|
||||
"""Submit Pass 0a as Anthropic Batch API job (50% cost reduction).
|
||||
|
||||
Returns a batch_id for polling. Results are processed asynchronously
|
||||
within 24 hours by Anthropic.
|
||||
"""
|
||||
from compliance.services.decomposition_pass import DecompositionPass
|
||||
|
||||
db = SessionLocal()
|
||||
try:
|
||||
decomp = DecompositionPass(db=db)
|
||||
result = await decomp.submit_batch_pass0a(
|
||||
limit=req.limit,
|
||||
batch_size=req.batch_size,
|
||||
category_filter=req.category_filter,
|
||||
source_filter=req.source_filter,
|
||||
)
|
||||
return MigrationResponse(status=result.pop("status", "submitted"), stats=result)
|
||||
except Exception as e:
|
||||
logger.error("Batch submit 0a failed: %s", e)
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
finally:
|
||||
db.close()
|
||||
|
||||
|
||||
@router.post("/migrate/batch-submit-0b", response_model=MigrationResponse)
|
||||
async def batch_submit_pass0b(req: BatchSubmitRequest):
|
||||
"""Submit Pass 0b as Anthropic Batch API job (50% cost reduction)."""
|
||||
from compliance.services.decomposition_pass import DecompositionPass
|
||||
|
||||
db = SessionLocal()
|
||||
try:
|
||||
decomp = DecompositionPass(db=db)
|
||||
result = await decomp.submit_batch_pass0b(
|
||||
limit=req.limit,
|
||||
batch_size=req.batch_size,
|
||||
)
|
||||
return MigrationResponse(status=result.pop("status", "submitted"), stats=result)
|
||||
except Exception as e:
|
||||
logger.error("Batch submit 0b failed: %s", e)
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
finally:
|
||||
db.close()
|
||||
|
||||
|
||||
@router.get("/migrate/batch-status/{batch_id}")
|
||||
async def batch_check_status(batch_id: str):
|
||||
"""Check processing status of an Anthropic batch job."""
|
||||
from compliance.services.decomposition_pass import check_batch_status
|
||||
|
||||
try:
|
||||
status = await check_batch_status(batch_id)
|
||||
return status
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@router.post("/migrate/batch-process", response_model=MigrationResponse)
|
||||
async def batch_process_results(req: BatchProcessRequest):
|
||||
"""Fetch and process results from a completed Anthropic batch.
|
||||
|
||||
Call this after batch-status shows processing_status='ended'.
|
||||
"""
|
||||
from compliance.services.decomposition_pass import DecompositionPass
|
||||
|
||||
db = SessionLocal()
|
||||
try:
|
||||
decomp = DecompositionPass(db=db)
|
||||
stats = await decomp.process_batch_results(
|
||||
batch_id=req.batch_id,
|
||||
pass_type=req.pass_type,
|
||||
)
|
||||
return MigrationResponse(status=stats.pop("status", "completed"), stats=stats)
|
||||
except Exception as e:
|
||||
logger.error("Batch process failed: %s", e)
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
finally:
|
||||
db.close()
|
||||
|
||||
|
||||
@router.post("/migrate/link-obligations", response_model=MigrationResponse)
async def migrate_link_obligations(req: MigrationRequest):
    """Pass 1: Link controls to obligations via source_citation article."""
    from compliance.services.pipeline_adapter import MigrationPasses

    session = SessionLocal()
    try:
        passes = MigrationPasses(db=session)
        await passes.initialize()
        linkage_stats = await passes.run_pass1_obligation_linkage(limit=req.limit)
        return MigrationResponse(status="completed", stats=linkage_stats)
    except Exception as e:
        logger.error("Migration pass 1 failed: %s", e)
        raise HTTPException(status_code=500, detail=str(e))
    finally:
        session.close()
|
||||
|
||||
|
||||
@router.post("/migrate/classify-patterns", response_model=MigrationResponse)
async def migrate_classify_patterns(req: MigrationRequest):
    """Pass 2: Classify controls into patterns via keyword matching."""
    from compliance.services.pipeline_adapter import MigrationPasses

    session = SessionLocal()
    try:
        passes = MigrationPasses(db=session)
        await passes.initialize()
        classification_stats = await passes.run_pass2_pattern_classification(limit=req.limit)
        return MigrationResponse(status="completed", stats=classification_stats)
    except Exception as e:
        logger.error("Migration pass 2 failed: %s", e)
        raise HTTPException(status_code=500, detail=str(e))
    finally:
        session.close()
|
||||
|
||||
|
||||
@router.post("/migrate/triage", response_model=MigrationResponse)
async def migrate_triage():
    """Pass 3: Quality triage — categorize by linkage completeness."""
    from compliance.services.pipeline_adapter import MigrationPasses

    session = SessionLocal()
    try:
        # Synchronous pass — no initialize() or await needed for triage.
        triage_stats = MigrationPasses(db=session).run_pass3_quality_triage()
        return MigrationResponse(status="completed", stats=triage_stats)
    except Exception as e:
        logger.error("Migration pass 3 failed: %s", e)
        raise HTTPException(status_code=500, detail=str(e))
    finally:
        session.close()
|
||||
|
||||
|
||||
@router.post("/migrate/backfill-crosswalk", response_model=MigrationResponse)
async def migrate_backfill_crosswalk():
    """Pass 4: Create crosswalk rows for linked controls."""
    from compliance.services.pipeline_adapter import MigrationPasses

    session = SessionLocal()
    try:
        # Synchronous pass — runs entirely against the DB, no LLM calls.
        backfill_stats = MigrationPasses(db=session).run_pass4_crosswalk_backfill()
        return MigrationResponse(status="completed", stats=backfill_stats)
    except Exception as e:
        logger.error("Migration pass 4 failed: %s", e)
        raise HTTPException(status_code=500, detail=str(e))
    finally:
        session.close()
|
||||
|
||||
|
||||
@router.post("/migrate/deduplicate", response_model=MigrationResponse)
async def migrate_deduplicate():
    """Pass 5: Mark duplicate controls (same obligation + pattern)."""
    from compliance.services.pipeline_adapter import MigrationPasses

    session = SessionLocal()
    try:
        dedup_stats = MigrationPasses(db=session).run_pass5_deduplication()
        return MigrationResponse(status="completed", stats=dedup_stats)
    except Exception as e:
        logger.error("Migration pass 5 failed: %s", e)
        raise HTTPException(status_code=500, detail=str(e))
    finally:
        session.close()
|
||||
|
||||
|
||||
@router.get("/migrate/status", response_model=MigrationStatusResponse)
async def migration_status():
    """Get overall migration progress."""
    from compliance.services.pipeline_adapter import MigrationPasses

    session = SessionLocal()
    try:
        progress = MigrationPasses(db=session).migration_status()
        return MigrationStatusResponse(**progress)
    except Exception as e:
        logger.error("Migration status failed: %s", e)
        raise HTTPException(status_code=500, detail=str(e))
    finally:
        session.close()
|
||||
|
||||
|
||||
@router.get("/migrate/decomposition-status", response_model=DecompositionStatusResponse)
async def decomposition_status():
    """Get decomposition progress (Pass 0a/0b)."""
    from compliance.services.decomposition_pass import DecompositionPass

    session = SessionLocal()
    try:
        progress = DecompositionPass(db=session).decomposition_status()
        return DecompositionStatusResponse(**progress)
    except Exception as e:
        logger.error("Decomposition status failed: %s", e)
        raise HTTPException(status_code=500, detail=str(e))
    finally:
        session.close()
|
||||
|
||||
|
||||
# =============================================================================
# BATCH DEDUP ENDPOINTS
# =============================================================================


# Module-level runner reference for status polling.
# Set by /migrate/batch-dedup while a run is active and reset to None in its
# finally block; /migrate/batch-dedup/status reads it for live progress.
# NOTE(review): module-global state assumes a single worker process — with
# multiple workers the status endpoint may not see a run on another worker.
_batch_dedup_runner = None
|
||||
|
||||
|
||||
@router.post("/migrate/batch-dedup", response_model=MigrationResponse)
async def migrate_batch_dedup(
    dry_run: bool = Query(False, description="Preview mode — no DB changes"),
    hint_filter: Optional[str] = Query(None, description="Only process hints matching this prefix"),
):
    """Batch dedup: reduce ~85k Pass 0b controls to ~18-25k masters.

    Phase 1: Groups by merge_group_hint, picks best quality master, links rest.
    Phase 2: Cross-group embedding search for semantically similar masters.
    """
    global _batch_dedup_runner
    from compliance.services.batch_dedup_runner import BatchDedupRunner

    session = SessionLocal()
    try:
        active_runner = BatchDedupRunner(db=session)
        # Publish the runner so /migrate/batch-dedup/status can poll progress.
        _batch_dedup_runner = active_runner
        dedup_stats = await active_runner.run(dry_run=dry_run, hint_filter=hint_filter)
        return MigrationResponse(status="completed", stats=dedup_stats)
    except Exception as e:
        logger.error("Batch dedup failed: %s", e)
        raise HTTPException(status_code=500, detail=str(e))
    finally:
        # Always clear the global reference, even when the run fails.
        _batch_dedup_runner = None
        session.close()
|
||||
|
||||
|
||||
@router.get("/migrate/batch-dedup/status")
async def batch_dedup_status():
    """Get current batch dedup progress (while running).

    If a dedup run is active (module-global _batch_dedup_runner is set),
    returns its live progress; otherwise reports aggregate DB statistics.
    """
    if _batch_dedup_runner is not None:
        return {"running": True, **_batch_dedup_runner.get_status()}

    # Not running — show DB stats
    db = SessionLocal()
    try:
        # One aggregate query: total Pass 0b controls, how many are marked
        # duplicate, and the remaining masters (neither duplicate nor deprecated).
        row = db.execute(text("""
            SELECT
                count(*) FILTER (WHERE decomposition_method = 'pass0b') AS total_pass0b,
                count(*) FILTER (WHERE decomposition_method = 'pass0b'
                                 AND release_state = 'duplicate') AS duplicates,
                count(*) FILTER (WHERE decomposition_method = 'pass0b'
                                 AND release_state != 'duplicate'
                                 AND release_state != 'deprecated') AS masters
            FROM canonical_controls
        """)).fetchone()
        # Open human-review items produced by the dedup run.
        review_count = db.execute(text(
            "SELECT count(*) FROM control_dedup_reviews WHERE review_status = 'pending'"
        )).fetchone()[0]
        return {
            "running": False,
            "total_pass0b": row[0],
            "duplicates": row[1],
            "masters": row[2],
            "pending_reviews": review_count,
        }
    finally:
        db.close()
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# HELPERS
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def _get_pattern_control_counts() -> dict[str, int]:
    """Get count of controls per pattern_id from DB.

    Best-effort helper: any DB error yields an empty mapping instead of
    propagating, so callers never fail on missing counts.
    """
    session = SessionLocal()
    try:
        rows = session.execute(text("""
            SELECT pattern_id, count(*) AS cnt
            FROM canonical_controls
            WHERE pattern_id IS NOT NULL AND pattern_id != ''
            AND release_state NOT IN ('deprecated')
            GROUP BY pattern_id
        """)).fetchall()
        return {pattern: total for pattern, total in rows}
    except Exception:
        # Deliberate swallow: counts are informational only.
        return {}
    finally:
        session.close()
|
||||
@@ -5,16 +5,23 @@ Endpoints:
|
||||
- /dashboard: Main compliance dashboard
|
||||
- /dashboard/executive: Executive summary for managers
|
||||
- /dashboard/trend: Compliance score trend over time
|
||||
- /dashboard/roadmap: Prioritised controls in 4 buckets
|
||||
- /dashboard/module-status: Completion status of each SDK module
|
||||
- /dashboard/next-actions: Top 5 most important actions
|
||||
- /dashboard/snapshot: Save / query compliance score snapshots
|
||||
- /score: Quick compliance score
|
||||
- /reports: Report generation
|
||||
"""
|
||||
|
||||
import logging
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from datetime import datetime, date, timedelta
|
||||
from calendar import month_abbr
|
||||
from typing import Optional
|
||||
from typing import Optional, Dict, Any, List
|
||||
from decimal import Decimal
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException, Query
|
||||
from pydantic import BaseModel
|
||||
from sqlalchemy import text
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from classroom_engine.database import get_db
|
||||
@@ -25,15 +32,24 @@ from ..db import (
|
||||
ControlRepository,
|
||||
EvidenceRepository,
|
||||
RiskRepository,
|
||||
AssertionDB,
|
||||
)
|
||||
from .schemas import (
|
||||
DashboardResponse,
|
||||
MultiDimensionalScore,
|
||||
ExecutiveDashboardResponse,
|
||||
TrendDataPoint,
|
||||
RiskSummary,
|
||||
DeadlineItem,
|
||||
TeamWorkloadItem,
|
||||
TraceabilityAssertion,
|
||||
TraceabilityEvidence,
|
||||
TraceabilityCoverage,
|
||||
TraceabilityControl,
|
||||
TraceabilityMatrixResponse,
|
||||
)
|
||||
from .tenant_utils import get_tenant_id as _get_tenant_id
|
||||
from .db_utils import row_to_dict as _row_to_dict
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
router = APIRouter(tags=["compliance-dashboard"])
|
||||
@@ -86,6 +102,14 @@ async def get_dashboard(db: Session = Depends(get_db)):
|
||||
# or compute from by_status dict
|
||||
score = ctrl_stats.get("compliance_score", 0.0)
|
||||
|
||||
# Multi-dimensional score (Anti-Fake-Evidence)
|
||||
try:
|
||||
ms = ctrl_repo.get_multi_dimensional_score()
|
||||
multi_score = MultiDimensionalScore(**ms)
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to compute multi-dimensional score: {e}")
|
||||
multi_score = None
|
||||
|
||||
return DashboardResponse(
|
||||
compliance_score=round(score, 1),
|
||||
total_regulations=len(regulations),
|
||||
@@ -98,6 +122,7 @@ async def get_dashboard(db: Session = Depends(get_db)):
|
||||
total_risks=len(risks),
|
||||
risks_by_level=risks_by_level,
|
||||
recent_activity=[],
|
||||
multi_score=multi_score,
|
||||
)
|
||||
|
||||
|
||||
@@ -116,11 +141,18 @@ async def get_compliance_score(db: Session = Depends(get_db)):
|
||||
else:
|
||||
score = 0
|
||||
|
||||
# Multi-dimensional score (Anti-Fake-Evidence)
|
||||
try:
|
||||
multi_score = ctrl_repo.get_multi_dimensional_score()
|
||||
except Exception:
|
||||
multi_score = None
|
||||
|
||||
return {
|
||||
"score": round(score, 1),
|
||||
"total_controls": total,
|
||||
"passing_controls": passing,
|
||||
"partial_controls": partial,
|
||||
"multi_score": multi_score,
|
||||
}
|
||||
|
||||
|
||||
@@ -322,6 +354,424 @@ async def get_compliance_trend(
|
||||
}
|
||||
|
||||
|
||||
# ============================================================================
# Dashboard Extended — Roadmap, Module-Status, Next-Actions, Snapshots
# ============================================================================

# Weight map for control prioritisation (higher = more important bucket).
_PRIORITY_WEIGHTS = {"legal": 5, "security": 3, "best_practice": 1, "operational": 2}

# SDK module definitions → DB table used for counting completion.
# Consumed by /dashboard/module-status; table names are interpolated into
# COUNT(*) queries, so this list must stay static (never user-supplied).
_MODULE_DEFS: List[Dict[str, str]] = [
    {"key": "vvt", "label": "VVT", "table": "compliance_vvt_activities"},
    {"key": "tom", "label": "TOM", "table": "compliance_toms"},
    {"key": "dsfa", "label": "DSFA", "table": "compliance_dsfa_assessments"},
    {"key": "loeschfristen", "label": "Loeschfristen", "table": "compliance_loeschfristen"},
    {"key": "risks", "label": "Risiken", "table": "compliance_risks"},
    {"key": "controls", "label": "Controls", "table": "compliance_controls"},
    {"key": "evidence", "label": "Nachweise", "table": "compliance_evidence"},
    {"key": "obligations", "label": "Pflichten", "table": "compliance_obligations"},
    {"key": "incidents", "label": "Vorfaelle", "table": "compliance_notfallplan_incidents"},
    {"key": "vendor", "label": "Auftragsverarbeiter", "table": "compliance_vendor_assessments"},
    {"key": "legal_templates", "label": "Rechtl. Dokumente", "table": "compliance_legal_templates"},
    {"key": "training", "label": "Schulungen", "table": "training_modules"},
    {"key": "audit", "label": "Audit", "table": "compliance_audit_sessions"},
    {"key": "security_backlog", "label": "Security-Backlog", "table": "compliance_security_backlog"},
    {"key": "quality", "label": "Qualitaet", "table": "compliance_quality_items"},
]
|
||||
|
||||
|
||||
@router.get("/dashboard/roadmap")
async def get_dashboard_roadmap(
    db: Session = Depends(get_db),
    tenant_id: str = Depends(_get_tenant_id),
):
    """Prioritised controls in 4 buckets: Quick Wins, Must Have, Should Have, Nice to Have.

    Controls with status 'pass' are excluded. Bucketing uses the category
    weight from _PRIORITY_WEIGHTS plus review overdue-ness; each bucket is
    sorted by days_overdue descending.
    """
    ctrl_repo = ControlRepository(db)
    controls = ctrl_repo.get_all()
    today = datetime.utcnow().date()

    buckets: Dict[str, list] = {
        "quick_wins": [],
        "must_have": [],
        "should_have": [],
        "nice_to_have": [],
    }

    for ctrl in controls:
        status = ctrl.status.value if ctrl.status else "planned"
        if status == "pass":
            continue  # already done

        # NOTE(review): if ctrl.category is an Enum, this string-keyed lookup
        # always falls back to 1 — confirm category is stored as a plain str.
        weight = _PRIORITY_WEIGHTS.get(ctrl.category if hasattr(ctrl, "category") else "best_practice", 1)
        days_overdue = 0
        if ctrl.next_review_at:
            # next_review_at may be a datetime or a date.
            review_date = ctrl.next_review_at.date() if hasattr(ctrl.next_review_at, "date") else ctrl.next_review_at
            days_overdue = (today - review_date).days

        item = {
            "id": str(ctrl.id),
            "control_id": ctrl.control_id,
            "title": ctrl.title,
            "status": status,
            "domain": ctrl.domain.value if ctrl.domain else "unknown",
            "owner": ctrl.owner,
            "next_review_at": ctrl.next_review_at.isoformat() if ctrl.next_review_at else None,
            "days_overdue": max(0, days_overdue),
            "weight": weight,
        }

        if weight >= 5 and days_overdue > 0:
            buckets["quick_wins"].append(item)
        elif weight >= 4:
            buckets["must_have"].append(item)
        elif weight >= 2:
            buckets["should_have"].append(item)
        else:
            buckets["nice_to_have"].append(item)

    # Sort each bucket by days overdue, most overdue first.
    for key in buckets:
        buckets[key].sort(key=lambda x: x["days_overdue"], reverse=True)

    return {
        "buckets": buckets,
        "counts": {k: len(v) for k, v in buckets.items()},
        "generated_at": datetime.utcnow().isoformat(),
    }
|
||||
|
||||
|
||||
@router.get("/dashboard/module-status")
async def get_module_status(
    db: Session = Depends(get_db),
    tenant_id: str = Depends(_get_tenant_id),
):
    """Completion status for each SDK module based on DB record counts."""
    modules = []
    for spec in _MODULE_DEFS:
        # Table names come from the static _MODULE_DEFS list, never from user
        # input, so the f-string interpolation below is safe.
        try:
            count_row = db.execute(
                text(f"SELECT COUNT(*) FROM {spec['table']} WHERE tenant_id = :tid"),
                {"tid": tenant_id},
            ).fetchone()
            record_count = int(count_row[0]) if count_row else 0
        except Exception:
            record_count = 0

        # Simple heuristic: 0 = not started, 1-2 = in progress, 3+ = complete.
        if record_count == 0:
            status, progress = "not_started", 0
        elif record_count < 3:
            status, progress = "in_progress", min(60, record_count * 30)
        else:
            status, progress = "complete", 100

        modules.append({
            "key": spec["key"],
            "label": spec["label"],
            "count": record_count,
            "status": status,
            "progress": progress,
        })

    started = sum(1 for m in modules if m["status"] != "not_started")
    complete = sum(1 for m in modules if m["status"] == "complete")

    return {
        "modules": modules,
        "total": len(modules),
        "started": started,
        "complete": complete,
        "overall_progress": round((complete / len(modules)) * 100, 1) if modules else 0,
    }
|
||||
|
||||
|
||||
@router.get("/dashboard/next-actions")
async def get_next_actions(
    limit: int = Query(5, ge=1, le=20),
    db: Session = Depends(get_db),
    tenant_id: str = Depends(_get_tenant_id),
):
    """Top N most important actions sorted by urgency*impact."""
    repo = ControlRepository(db)
    today = datetime.utcnow().date()

    actions = []
    for ctrl in repo.get_all():
        ctrl_status = ctrl.status.value if ctrl.status else "planned"
        if ctrl_status == "pass":
            continue  # passing controls need no action

        overdue_days = 0
        if ctrl.next_review_at:
            # next_review_at may be a datetime or a date.
            due = ctrl.next_review_at.date() if hasattr(ctrl.next_review_at, "date") else ctrl.next_review_at
            overdue_days = max(0, (today - due).days)

        weight = _PRIORITY_WEIGHTS.get(ctrl.category if hasattr(ctrl, "category") else "best_practice", 1)

        actions.append({
            "id": str(ctrl.id),
            "control_id": ctrl.control_id,
            "title": ctrl.title,
            "status": ctrl_status,
            "domain": ctrl.domain.value if ctrl.domain else "unknown",
            "owner": ctrl.owner,
            "days_overdue": overdue_days,
            "urgency_score": weight * 10 + overdue_days,
            "reason": "Ueberfaellig" if overdue_days > 0 else "Offen",
        })

    actions.sort(key=lambda x: x["urgency_score"], reverse=True)
    return {"actions": actions[:limit]}
|
||||
|
||||
|
||||
@router.post("/dashboard/snapshot")
async def create_score_snapshot(
    db: Session = Depends(get_db),
    tenant_id: str = Depends(_get_tenant_id),
):
    """Save current compliance score as a historical snapshot.

    Computes the weighted score (pass = 1.0, partial = 0.5) from control
    statistics and upserts one row per tenant and day into
    compliance_score_snapshots, returning the stored row as a dict.
    """
    ctrl_repo = ControlRepository(db)
    evidence_repo = EvidenceRepository(db)
    risk_repo = RiskRepository(db)

    ctrl_stats = ctrl_repo.get_statistics()
    evidence_stats = evidence_repo.get_statistics()
    risks = risk_repo.get_all()

    total = ctrl_stats.get("total", 0)
    passing = ctrl_stats.get("pass", 0)
    partial = ctrl_stats.get("partial", 0)
    # Partial controls count half; guard against division by zero.
    score = round(((passing + partial * 0.5) / total) * 100, 2) if total > 0 else 0

    # "High" bucket includes critical risks as well.
    risks_high = sum(1 for r in risks if (r.inherent_risk.value if r.inherent_risk else "low") in ("high", "critical"))

    today = date.today()

    # Upsert: one snapshot per (tenant, project, day).
    # NOTE(review): the conflict target includes project_id, but the INSERT
    # column list does not — this relies on a DB default for project_id;
    # confirm against the compliance_score_snapshots schema.
    row = db.execute(text("""
        INSERT INTO compliance_score_snapshots (
            tenant_id, score, controls_total, controls_pass, controls_partial,
            evidence_total, evidence_valid, risks_total, risks_high, snapshot_date
        ) VALUES (
            :tenant_id, :score, :controls_total, :controls_pass, :controls_partial,
            :evidence_total, :evidence_valid, :risks_total, :risks_high, :snapshot_date
        )
        ON CONFLICT (tenant_id, project_id, snapshot_date) DO UPDATE SET
            score = EXCLUDED.score,
            controls_total = EXCLUDED.controls_total,
            controls_pass = EXCLUDED.controls_pass,
            controls_partial = EXCLUDED.controls_partial,
            evidence_total = EXCLUDED.evidence_total,
            evidence_valid = EXCLUDED.evidence_valid,
            risks_total = EXCLUDED.risks_total,
            risks_high = EXCLUDED.risks_high
        RETURNING *
    """), {
        "tenant_id": tenant_id,
        "score": score,
        "controls_total": total,
        "controls_pass": passing,
        "controls_partial": partial,
        "evidence_total": evidence_stats.get("total", 0),
        "evidence_valid": evidence_stats.get("by_status", {}).get("valid", 0),
        "risks_total": len(risks),
        "risks_high": risks_high,
        "snapshot_date": today,
    }).fetchone()
    db.commit()

    return _row_to_dict(row)
|
||||
|
||||
|
||||
@router.get("/dashboard/score-history")
async def get_score_history(
    months: int = Query(12, ge=1, le=36),
    db: Session = Depends(get_db),
    tenant_id: str = Depends(_get_tenant_id),
):
    """Get compliance score history from snapshots."""
    # Approximate the lookback window as 30 days per month.
    since = date.today() - timedelta(days=months * 30)

    rows = db.execute(text("""
        SELECT * FROM compliance_score_snapshots
        WHERE tenant_id = :tenant_id AND snapshot_date >= :since
        ORDER BY snapshot_date ASC
    """), {"tenant_id": tenant_id, "since": since}).fetchall()

    snapshots = []
    for row in rows:
        entry = _row_to_dict(row)
        # JSON cannot serialise Decimal — downcast to float.
        if isinstance(entry.get("score"), Decimal):
            entry["score"] = float(entry["score"])
        snapshots.append(entry)

    return {
        "snapshots": snapshots,
        "total": len(snapshots),
        "period_months": months,
    }
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Evidence Distribution (Anti-Fake-Evidence Phase 3)
|
||||
# ============================================================================
|
||||
|
||||
@router.get("/dashboard/evidence-distribution")
async def get_evidence_distribution(
    db: Session = Depends(get_db),
    tenant_id: str = Depends(_get_tenant_id),
):
    """Evidence counts by confidence level and four-eyes status."""
    records = EvidenceRepository(db).get_all()

    by_confidence = {"E0": 0, "E1": 0, "E2": 0, "E3": 0, "E4": 0}
    four_eyes_pending = 0

    for record in records:
        # Missing confidence level defaults to E1.
        level = record.confidence_level.value if record.confidence_level else "E1"
        if level in by_confidence:
            by_confidence[level] += 1
        # Pending = four-eyes required but neither approved nor rejected yet.
        if record.requires_four_eyes and record.approval_status not in ("approved", "rejected"):
            four_eyes_pending += 1

    return {
        "by_confidence": by_confidence,
        "four_eyes_pending": four_eyes_pending,
        "total": len(records),
    }
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Traceability Matrix (Anti-Fake-Evidence Phase 4a)
|
||||
# ============================================================================
|
||||
|
||||
@router.get("/dashboard/traceability-matrix", response_model=TraceabilityMatrixResponse)
async def get_traceability_matrix(
    db: Session = Depends(get_db),
    tenant_id: str = Depends(_get_tenant_id),
):
    """
    Full traceability chain: Control → Evidence → Assertions.

    Loads each entity set once, builds in-memory indices, and nests
    the result so the frontend can render a matrix view.
    """
    ctrl_repo = ControlRepository(db)
    evidence_repo = EvidenceRepository(db)

    # 1. Load all three entity sets (one query each — no per-control queries)
    controls = ctrl_repo.get_all()
    all_evidence = evidence_repo.get_all()
    all_assertions = db.query(AssertionDB).filter(
        AssertionDB.entity_type == "evidence",
    ).all()

    # 2. Index assertions by evidence_id (entity_id)
    assertions_by_evidence: Dict[str, list] = {}
    for a in all_assertions:
        assertions_by_evidence.setdefault(a.entity_id, []).append(a)

    # 3. Index evidence by control_id
    evidence_by_control: Dict[str, list] = {}
    for e in all_evidence:
        evidence_by_control.setdefault(str(e.control_id), []).append(e)

    # Confidence-level ranking, lowest first (loop-invariant — hoisted out of
    # the per-control loop where it was previously rebuilt every iteration).
    conf_order = {"E0": 0, "E1": 1, "E2": 2, "E3": 3, "E4": 4}

    # 4. Build nested response
    result_controls: list = []
    covered_controls = 0
    fully_verified = 0

    for ctrl in controls:
        ctrl_id = str(ctrl.id)
        ctrl_evidence = evidence_by_control.get(ctrl_id, [])

        nested_evidence: list = []
        has_evidence = len(ctrl_evidence) > 0
        has_assertions = False
        all_verified = True
        min_conf: Optional[str] = None

        for e in ctrl_evidence:
            ev_id = str(e.id)
            ev_assertions = assertions_by_evidence.get(ev_id, [])

            nested_assertions = [
                TraceabilityAssertion(
                    id=str(a.id),
                    sentence_text=a.sentence_text,
                    assertion_type=a.assertion_type or "assertion",
                    confidence=a.confidence or 0.0,
                    # "verified" means a reviewer is recorded on the assertion
                    verified=a.verified_by is not None,
                )
                for a in ev_assertions
            ]

            if nested_assertions:
                has_assertions = True
                for na in nested_assertions:
                    if not na.verified:
                        all_verified = False

            # Track the weakest confidence level across this control's evidence
            conf = e.confidence_level.value if e.confidence_level else "E1"
            if min_conf is None or conf_order.get(conf, 1) < conf_order.get(min_conf, 1):
                min_conf = conf

            nested_evidence.append(TraceabilityEvidence(
                id=ev_id,
                title=e.title,
                evidence_type=e.evidence_type,
                confidence_level=conf,
                status=e.status.value if e.status else "valid",
                assertions=nested_assertions,
            ))

        # A control with no assertions at all can never be fully verified
        if not has_assertions:
            all_verified = False

        if has_evidence:
            covered_controls += 1
        if has_evidence and has_assertions and all_verified:
            fully_verified += 1

        coverage = TraceabilityCoverage(
            has_evidence=has_evidence,
            has_assertions=has_assertions,
            all_assertions_verified=all_verified,
            min_confidence_level=min_conf,
        )

        result_controls.append(TraceabilityControl(
            id=ctrl_id,
            control_id=ctrl.control_id,
            title=ctrl.title,
            status=ctrl.status.value if ctrl.status else "planned",
            domain=ctrl.domain.value if ctrl.domain else "unknown",
            evidence=nested_evidence,
            coverage=coverage,
        ))

    # total_controls was previously a per-iteration counter; len() is equivalent.
    total_controls = len(controls)
    summary = {
        "total_controls": total_controls,
        "covered_controls": covered_controls,
        "fully_verified": fully_verified,
        "uncovered_controls": total_controls - covered_controls,
    }

    return TraceabilityMatrixResponse(controls=result_controls, summary=summary)
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Reports
|
||||
# ============================================================================
|
||||
|
||||
@@ -60,10 +60,314 @@ def get_dsfa_service(db: Session = Depends(get_db)) -> DSFAService:
|
||||
return DSFAService(db)
|
||||
|
||||
|
||||
def get_workflow_service(
    db: Session = Depends(get_db),
) -> DSFAWorkflowService:
    """FastAPI dependency: DSFAWorkflowService bound to the request DB session."""
    return DSFAWorkflowService(db)
|
||||
# =============================================================================
|
||||
# Pydantic Schemas
|
||||
# =============================================================================
|
||||
|
||||
class DSFACreate(BaseModel):
    """Request body for creating a DSFA (data protection impact assessment).

    Only the title is required; everything else has a default or is optional.
    Section comments mirror the DSFA form structure. The list defaults are
    safe here: Pydantic copies field defaults per model instance.
    """
    title: str
    description: str = ""
    status: str = "draft"
    risk_level: str = "low"
    processing_activity: str = ""
    data_categories: List[str] = []
    recipients: List[str] = []
    measures: List[str] = []
    created_by: str = "system"
    # Section 1 — processing description and legal basis
    processing_description: Optional[str] = None
    processing_purpose: Optional[str] = None
    legal_basis: Optional[str] = None
    legal_basis_details: Optional[str] = None
    # Section 2 — necessity and proportionality
    necessity_assessment: Optional[str] = None
    proportionality_assessment: Optional[str] = None
    data_minimization: Optional[str] = None
    alternatives_considered: Optional[str] = None
    retention_justification: Optional[str] = None
    # Section 3 — risk assessment
    involves_ai: Optional[bool] = None
    overall_risk_level: Optional[str] = None
    risk_score: Optional[int] = None
    # Section 6 — DPO and supervisory-authority consultation
    dpo_consulted: Optional[bool] = None
    dpo_name: Optional[str] = None
    dpo_opinion: Optional[str] = None
    dpo_approved: Optional[bool] = None
    authority_consulted: Optional[bool] = None
    authority_reference: Optional[str] = None
    authority_decision: Optional[str] = None
    # Metadata
    version: Optional[int] = None
    conclusion: Optional[str] = None
    federal_state: Optional[str] = None
    authority_resource_id: Optional[str] = None
    submitted_by: Optional[str] = None
    # JSONB Arrays
    data_subjects: Optional[List[str]] = None
    affected_rights: Optional[List[str]] = None
    triggered_rule_codes: Optional[List[str]] = None
    ai_trigger_ids: Optional[List[str]] = None
    wp248_criteria_met: Optional[List[str]] = None
    art35_abs3_triggered: Optional[List[str]] = None
    tom_references: Optional[List[str]] = None
    risks: Optional[List[dict]] = None
    mitigations: Optional[List[dict]] = None
    stakeholder_consultations: Optional[List[dict]] = None
    review_triggers: Optional[List[dict]] = None
    review_comments: Optional[List[dict]] = None
    ai_use_case_modules: Optional[List[dict]] = None
    section_8_complete: Optional[bool] = None
    # JSONB Objects
    threshold_analysis: Optional[dict] = None
    consultation_requirement: Optional[dict] = None
    review_schedule: Optional[dict] = None
    section_progress: Optional[dict] = None
    metadata: Optional[dict] = None
|
||||
|
||||
|
||||
class DSFAUpdate(BaseModel):
    """Request body for updating a DSFA.

    Every field is optional; presumably only the fields a client sends are
    applied (partial-update semantics) — confirm against the update handler.
    Section comments mirror the DSFA form structure.
    """
    title: Optional[str] = None
    description: Optional[str] = None
    status: Optional[str] = None
    risk_level: Optional[str] = None
    processing_activity: Optional[str] = None
    data_categories: Optional[List[str]] = None
    recipients: Optional[List[str]] = None
    measures: Optional[List[str]] = None
    approved_by: Optional[str] = None
    # Section 1 — processing description and legal basis
    processing_description: Optional[str] = None
    processing_purpose: Optional[str] = None
    legal_basis: Optional[str] = None
    legal_basis_details: Optional[str] = None
    # Section 2 — necessity and proportionality
    necessity_assessment: Optional[str] = None
    proportionality_assessment: Optional[str] = None
    data_minimization: Optional[str] = None
    alternatives_considered: Optional[str] = None
    retention_justification: Optional[str] = None
    # Section 3 — risk assessment
    involves_ai: Optional[bool] = None
    overall_risk_level: Optional[str] = None
    risk_score: Optional[int] = None
    # Section 6 — DPO and supervisory-authority consultation
    dpo_consulted: Optional[bool] = None
    dpo_name: Optional[str] = None
    dpo_opinion: Optional[str] = None
    dpo_approved: Optional[bool] = None
    authority_consulted: Optional[bool] = None
    authority_reference: Optional[str] = None
    authority_decision: Optional[str] = None
    # Metadata
    version: Optional[int] = None
    conclusion: Optional[str] = None
    federal_state: Optional[str] = None
    authority_resource_id: Optional[str] = None
    submitted_by: Optional[str] = None
    # JSONB Arrays
    data_subjects: Optional[List[str]] = None
    affected_rights: Optional[List[str]] = None
    triggered_rule_codes: Optional[List[str]] = None
    ai_trigger_ids: Optional[List[str]] = None
    wp248_criteria_met: Optional[List[str]] = None
    art35_abs3_triggered: Optional[List[str]] = None
    tom_references: Optional[List[str]] = None
    risks: Optional[List[dict]] = None
    mitigations: Optional[List[dict]] = None
    stakeholder_consultations: Optional[List[dict]] = None
    review_triggers: Optional[List[dict]] = None
    review_comments: Optional[List[dict]] = None
    ai_use_case_modules: Optional[List[dict]] = None
    section_8_complete: Optional[bool] = None
    # JSONB Objects
    threshold_analysis: Optional[dict] = None
    consultation_requirement: Optional[dict] = None
    review_schedule: Optional[dict] = None
    section_progress: Optional[dict] = None
    metadata: Optional[dict] = None
|
||||
|
||||
|
||||
class DSFAStatusUpdate(BaseModel):
|
||||
status: str
|
||||
approved_by: Optional[str] = None
|
||||
|
||||
|
||||
class DSFASectionUpdate(BaseModel):
|
||||
"""Body for PUT /dsfa/{id}/sections/{section_number}."""
|
||||
content: Optional[str] = None
|
||||
# Allow arbitrary extra fields so the frontend can send any section-specific data
|
||||
extra: Optional[dict] = None
|
||||
|
||||
|
||||
class DSFAApproveRequest(BaseModel):
|
||||
"""Body for POST /dsfa/{id}/approve."""
|
||||
approved: bool
|
||||
comments: Optional[str] = None
|
||||
approved_by: Optional[str] = None
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Helpers
|
||||
# =============================================================================
|
||||
|
||||
def _get_tenant_id(tenant_id: Optional[str]) -> str:
|
||||
return tenant_id or DEFAULT_TENANT_ID
|
||||
|
||||
|
||||
def _dsfa_to_response(row) -> dict:
|
||||
"""Convert a DB row to a JSON-serializable dict."""
|
||||
import json
|
||||
# SQLAlchemy 2.0: Row objects need ._mapping for string-key access
|
||||
if hasattr(row, "_mapping"):
|
||||
row = row._mapping
|
||||
|
||||
def _parse_arr(val):
|
||||
"""Parse a JSONB array field → list."""
|
||||
if val is None:
|
||||
return []
|
||||
if isinstance(val, list):
|
||||
return val
|
||||
if isinstance(val, str):
|
||||
try:
|
||||
parsed = json.loads(val)
|
||||
return parsed if isinstance(parsed, list) else []
|
||||
except Exception:
|
||||
return []
|
||||
return val
|
||||
|
||||
def _parse_obj(val):
|
||||
"""Parse a JSONB object field → dict."""
|
||||
if val is None:
|
||||
return {}
|
||||
if isinstance(val, dict):
|
||||
return val
|
||||
if isinstance(val, str):
|
||||
try:
|
||||
parsed = json.loads(val)
|
||||
return parsed if isinstance(parsed, dict) else {}
|
||||
except Exception:
|
||||
return {}
|
||||
return val
|
||||
|
||||
def _ts(val):
|
||||
"""Timestamp → ISO string or None."""
|
||||
if not val:
|
||||
return None
|
||||
if isinstance(val, str):
|
||||
return val
|
||||
return val.isoformat()
|
||||
|
||||
def _get(key, default=None):
|
||||
"""Safe row access — returns default if key missing (handles old rows)."""
|
||||
try:
|
||||
v = row[key]
|
||||
return default if v is None and default is not None else v
|
||||
except (KeyError, IndexError):
|
||||
return default
|
||||
|
||||
return {
|
||||
# Core fields (always present since Migration 024)
|
||||
"id": str(row["id"]),
|
||||
"tenant_id": row["tenant_id"],
|
||||
"title": row["title"],
|
||||
"description": row["description"] or "",
|
||||
"status": row["status"] or "draft",
|
||||
"risk_level": row["risk_level"] or "low",
|
||||
"processing_activity": row["processing_activity"] or "",
|
||||
"data_categories": _parse_arr(row["data_categories"]),
|
||||
"recipients": _parse_arr(row["recipients"]),
|
||||
"measures": _parse_arr(row["measures"]),
|
||||
"approved_by": row["approved_by"],
|
||||
"approved_at": _ts(row["approved_at"]),
|
||||
"created_by": row["created_by"] or "system",
|
||||
"created_at": _ts(row["created_at"]),
|
||||
"updated_at": _ts(row["updated_at"]),
|
||||
# Section 1 (Migration 030)
|
||||
"processing_description": _get("processing_description"),
|
||||
"processing_purpose": _get("processing_purpose"),
|
||||
"legal_basis": _get("legal_basis"),
|
||||
"legal_basis_details": _get("legal_basis_details"),
|
||||
# Section 2
|
||||
"necessity_assessment": _get("necessity_assessment"),
|
||||
"proportionality_assessment": _get("proportionality_assessment"),
|
||||
"data_minimization": _get("data_minimization"),
|
||||
"alternatives_considered": _get("alternatives_considered"),
|
||||
"retention_justification": _get("retention_justification"),
|
||||
# Section 3
|
||||
"involves_ai": _get("involves_ai", False),
|
||||
"overall_risk_level": _get("overall_risk_level"),
|
||||
"risk_score": _get("risk_score", 0),
|
||||
# Section 6
|
||||
"dpo_consulted": _get("dpo_consulted", False),
|
||||
"dpo_consulted_at": _ts(_get("dpo_consulted_at")),
|
||||
"dpo_name": _get("dpo_name"),
|
||||
"dpo_opinion": _get("dpo_opinion"),
|
||||
"dpo_approved": _get("dpo_approved"),
|
||||
"authority_consulted": _get("authority_consulted", False),
|
||||
"authority_consulted_at": _ts(_get("authority_consulted_at")),
|
||||
"authority_reference": _get("authority_reference"),
|
||||
"authority_decision": _get("authority_decision"),
|
||||
# Metadata / Versioning
|
||||
"version": _get("version", 1),
|
||||
"previous_version_id": str(_get("previous_version_id")) if _get("previous_version_id") else None,
|
||||
"conclusion": _get("conclusion"),
|
||||
"federal_state": _get("federal_state"),
|
||||
"authority_resource_id": _get("authority_resource_id"),
|
||||
"submitted_for_review_at": _ts(_get("submitted_for_review_at")),
|
||||
"submitted_by": _get("submitted_by"),
|
||||
# JSONB Arrays
|
||||
"data_subjects": _parse_arr(_get("data_subjects")),
|
||||
"affected_rights": _parse_arr(_get("affected_rights")),
|
||||
"triggered_rule_codes": _parse_arr(_get("triggered_rule_codes")),
|
||||
"ai_trigger_ids": _parse_arr(_get("ai_trigger_ids")),
|
||||
"wp248_criteria_met": _parse_arr(_get("wp248_criteria_met")),
|
||||
"art35_abs3_triggered": _parse_arr(_get("art35_abs3_triggered")),
|
||||
"tom_references": _parse_arr(_get("tom_references")),
|
||||
"risks": _parse_arr(_get("risks")),
|
||||
"mitigations": _parse_arr(_get("mitigations")),
|
||||
"stakeholder_consultations": _parse_arr(_get("stakeholder_consultations")),
|
||||
"review_triggers": _parse_arr(_get("review_triggers")),
|
||||
"review_comments": _parse_arr(_get("review_comments")),
|
||||
# Section 8 / AI (Migration 028)
|
||||
"ai_use_case_modules": _parse_arr(_get("ai_use_case_modules")),
|
||||
"section_8_complete": _get("section_8_complete", False),
|
||||
# JSONB Objects
|
||||
"threshold_analysis": _parse_obj(_get("threshold_analysis")),
|
||||
"consultation_requirement": _parse_obj(_get("consultation_requirement")),
|
||||
"review_schedule": _parse_obj(_get("review_schedule")),
|
||||
"section_progress": _parse_obj(_get("section_progress")),
|
||||
"metadata": _parse_obj(_get("metadata")),
|
||||
}
|
||||
|
||||
|
||||
def _log_audit(
|
||||
db: Session,
|
||||
tenant_id: str,
|
||||
dsfa_id,
|
||||
action: str,
|
||||
changed_by: str = "system",
|
||||
old_values=None,
|
||||
new_values=None,
|
||||
):
|
||||
import json
|
||||
db.execute(
|
||||
text("""
|
||||
INSERT INTO compliance_dsfa_audit_log
|
||||
(tenant_id, dsfa_id, action, changed_by, old_values, new_values)
|
||||
VALUES
|
||||
(:tenant_id, :dsfa_id, :action, :changed_by,
|
||||
CAST(:old_values AS jsonb), CAST(:new_values AS jsonb))
|
||||
"""),
|
||||
{
|
||||
"tenant_id": tenant_id,
|
||||
"dsfa_id": str(dsfa_id) if dsfa_id else None,
|
||||
"action": action,
|
||||
"changed_by": changed_by,
|
||||
"old_values": json.dumps(old_values) if old_values else None,
|
||||
"new_values": json.dumps(new_values) if new_values else None,
|
||||
},
|
||||
)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
@@ -177,8 +481,51 @@ async def create_dsfa(
|
||||
service: DSFAService = Depends(get_dsfa_service),
|
||||
) -> dict[str, Any]:
|
||||
"""Neue DSFA erstellen."""
|
||||
with translate_domain_errors():
|
||||
return service.create(tenant_id, request)
|
||||
import json
|
||||
|
||||
if request.status not in VALID_STATUSES:
|
||||
raise HTTPException(status_code=422, detail=f"Ungültiger Status: {request.status}")
|
||||
if request.risk_level not in VALID_RISK_LEVELS:
|
||||
raise HTTPException(status_code=422, detail=f"Ungültiges Risiko-Level: {request.risk_level}")
|
||||
|
||||
tid = _get_tenant_id(tenant_id)
|
||||
|
||||
row = db.execute(
|
||||
text("""
|
||||
INSERT INTO compliance_dsfas
|
||||
(tenant_id, title, description, status, risk_level,
|
||||
processing_activity, data_categories, recipients, measures, created_by)
|
||||
VALUES
|
||||
(:tenant_id, :title, :description, :status, :risk_level,
|
||||
:processing_activity,
|
||||
CAST(:data_categories AS jsonb),
|
||||
CAST(:recipients AS jsonb),
|
||||
CAST(:measures AS jsonb),
|
||||
:created_by)
|
||||
RETURNING *
|
||||
"""),
|
||||
{
|
||||
"tenant_id": tid,
|
||||
"title": request.title,
|
||||
"description": request.description,
|
||||
"status": request.status,
|
||||
"risk_level": request.risk_level,
|
||||
"processing_activity": request.processing_activity,
|
||||
"data_categories": json.dumps(request.data_categories),
|
||||
"recipients": json.dumps(request.recipients),
|
||||
"measures": json.dumps(request.measures),
|
||||
"created_by": request.created_by,
|
||||
},
|
||||
).fetchone()
|
||||
|
||||
db.flush()
|
||||
row_id = row._mapping["id"] if hasattr(row, "_mapping") else row[0]
|
||||
_log_audit(
|
||||
db, tid, row_id, "CREATE", request.created_by,
|
||||
new_values={"title": request.title, "status": request.status},
|
||||
)
|
||||
db.commit()
|
||||
return _dsfa_to_response(row)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
|
||||
1151
backend-compliance/compliance/api/evidence_check_routes.py
Normal file
1151
backend-compliance/compliance/api/evidence_check_routes.py
Normal file
File diff suppressed because it is too large
Load Diff
@@ -22,23 +22,21 @@ from fastapi import APIRouter, Depends, File, HTTPException, Query, UploadFile
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from classroom_engine.database import get_db
|
||||
from compliance.api._http_errors import translate_domain_errors
|
||||
from compliance.db import ControlRepository, EvidenceRepository
|
||||
from compliance.schemas.evidence import (
|
||||
EvidenceCreate,
|
||||
EvidenceListResponse,
|
||||
EvidenceResponse,
|
||||
|
||||
from ..db import (
|
||||
ControlRepository,
|
||||
EvidenceRepository,
|
||||
EvidenceStatusEnum,
|
||||
EvidenceConfidenceEnum,
|
||||
EvidenceTruthStatusEnum,
|
||||
)
|
||||
from compliance.services.auto_risk_updater import AutoRiskUpdater
|
||||
from compliance.domain import NotFoundError, ValidationError
|
||||
from compliance.services.evidence_service import (
|
||||
SOURCE_CONTROL_MAP,
|
||||
EvidenceService,
|
||||
_extract_findings_detail, # re-exported for legacy test imports
|
||||
_parse_ci_evidence, # re-exported for legacy test imports
|
||||
_store_evidence, # re-exported for legacy test imports
|
||||
_update_risks as _update_risks_impl,
|
||||
from ..db.models import EvidenceDB, ControlDB, AuditTrailDB
|
||||
from ..services.auto_risk_updater import AutoRiskUpdater
|
||||
from .schemas import (
|
||||
EvidenceCreate, EvidenceResponse, EvidenceListResponse,
|
||||
EvidenceRejectRequest,
|
||||
)
|
||||
from .audit_trail_utils import log_audit_trail
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
router = APIRouter(tags=["compliance-evidence"])
|
||||
@@ -56,7 +54,88 @@ def get_evidence_service(db: Session = Depends(get_db)) -> EvidenceService:
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Evidence CRUD
|
||||
# Anti-Fake-Evidence: Four-Eyes Domain Check
|
||||
# ============================================================================
|
||||
|
||||
FOUR_EYES_DOMAINS = {"gov", "priv"}
|
||||
|
||||
|
||||
def _requires_four_eyes(control_domain: str) -> bool:
|
||||
"""Controls in governance/privacy domains require two independent reviewers."""
|
||||
return control_domain in FOUR_EYES_DOMAINS
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Anti-Fake-Evidence: Auto-Classification Helpers
|
||||
# ============================================================================
|
||||
|
||||
def _classify_confidence(source: Optional[str], evidence_type: Optional[str] = None, artifact_hash: Optional[str] = None) -> EvidenceConfidenceEnum:
|
||||
"""Classify evidence confidence level based on source and metadata."""
|
||||
if source == "ci_pipeline":
|
||||
return EvidenceConfidenceEnum.E3
|
||||
if source == "api" and artifact_hash:
|
||||
return EvidenceConfidenceEnum.E3
|
||||
if source == "api":
|
||||
return EvidenceConfidenceEnum.E3
|
||||
if source in ("manual", "upload"):
|
||||
return EvidenceConfidenceEnum.E1
|
||||
if source == "generated":
|
||||
return EvidenceConfidenceEnum.E0
|
||||
# Default for unknown sources
|
||||
return EvidenceConfidenceEnum.E1
|
||||
|
||||
|
||||
def _classify_truth_status(source: Optional[str]) -> EvidenceTruthStatusEnum:
|
||||
"""Classify evidence truth status based on source."""
|
||||
if source == "ci_pipeline":
|
||||
return EvidenceTruthStatusEnum.OBSERVED
|
||||
if source in ("manual", "upload"):
|
||||
return EvidenceTruthStatusEnum.UPLOADED
|
||||
if source == "generated":
|
||||
return EvidenceTruthStatusEnum.GENERATED
|
||||
if source == "api":
|
||||
return EvidenceTruthStatusEnum.OBSERVED
|
||||
return EvidenceTruthStatusEnum.UPLOADED
|
||||
|
||||
|
||||
def _build_evidence_response(e: EvidenceDB) -> EvidenceResponse:
|
||||
"""Build an EvidenceResponse from an EvidenceDB, including anti-fake fields."""
|
||||
return EvidenceResponse(
|
||||
id=e.id,
|
||||
control_id=e.control_id,
|
||||
evidence_type=e.evidence_type,
|
||||
title=e.title,
|
||||
description=e.description,
|
||||
artifact_path=e.artifact_path,
|
||||
artifact_url=e.artifact_url,
|
||||
artifact_hash=e.artifact_hash,
|
||||
file_size_bytes=e.file_size_bytes,
|
||||
mime_type=e.mime_type,
|
||||
valid_from=e.valid_from,
|
||||
valid_until=e.valid_until,
|
||||
status=e.status.value if e.status else None,
|
||||
source=e.source,
|
||||
ci_job_id=e.ci_job_id,
|
||||
uploaded_by=e.uploaded_by,
|
||||
collected_at=e.collected_at,
|
||||
created_at=e.created_at,
|
||||
confidence_level=e.confidence_level.value if e.confidence_level else None,
|
||||
truth_status=e.truth_status.value if e.truth_status else None,
|
||||
generation_mode=e.generation_mode,
|
||||
may_be_used_as_evidence=e.may_be_used_as_evidence,
|
||||
reviewed_by=e.reviewed_by,
|
||||
reviewed_at=e.reviewed_at,
|
||||
approval_status=e.approval_status,
|
||||
first_reviewer=e.first_reviewer,
|
||||
first_reviewed_at=e.first_reviewed_at,
|
||||
second_reviewer=e.second_reviewer,
|
||||
second_reviewed_at=e.second_reviewed_at,
|
||||
requires_four_eyes=e.requires_four_eyes,
|
||||
)
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Evidence
|
||||
# ============================================================================
|
||||
|
||||
@router.get("/evidence", response_model=EvidenceListResponse)
|
||||
@@ -69,8 +148,38 @@ async def list_evidence(
|
||||
service: EvidenceService = Depends(get_evidence_service),
|
||||
) -> EvidenceListResponse:
|
||||
"""List evidence with optional filters and pagination."""
|
||||
with translate_domain_errors():
|
||||
return service.list_evidence(control_id, evidence_type, status, page, limit)
|
||||
repo = EvidenceRepository(db)
|
||||
|
||||
if control_id:
|
||||
# First get the control UUID
|
||||
ctrl_repo = ControlRepository(db)
|
||||
control = ctrl_repo.get_by_control_id(control_id)
|
||||
if not control:
|
||||
raise HTTPException(status_code=404, detail=f"Control {control_id} not found")
|
||||
evidence = repo.get_by_control(control.id)
|
||||
else:
|
||||
evidence = repo.get_all()
|
||||
|
||||
if evidence_type:
|
||||
evidence = [e for e in evidence if e.evidence_type == evidence_type]
|
||||
|
||||
if status:
|
||||
try:
|
||||
status_enum = EvidenceStatusEnum(status)
|
||||
evidence = [e for e in evidence if e.status == status_enum]
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
total = len(evidence)
|
||||
|
||||
# Apply pagination if requested
|
||||
if page is not None and limit is not None:
|
||||
offset = (page - 1) * limit
|
||||
evidence = evidence[offset:offset + limit]
|
||||
|
||||
results = [_build_evidence_response(e) for e in evidence]
|
||||
|
||||
return EvidenceListResponse(evidence=results, total=total)
|
||||
|
||||
|
||||
@router.post("/evidence", response_model=EvidenceResponse)
|
||||
@@ -79,8 +188,66 @@ async def create_evidence(
|
||||
service: EvidenceService = Depends(get_evidence_service),
|
||||
) -> EvidenceResponse:
|
||||
"""Create new evidence record."""
|
||||
with translate_domain_errors():
|
||||
return service.create_evidence(evidence_data)
|
||||
repo = EvidenceRepository(db)
|
||||
|
||||
# Get control UUID
|
||||
ctrl_repo = ControlRepository(db)
|
||||
control = ctrl_repo.get_by_control_id(evidence_data.control_id)
|
||||
if not control:
|
||||
raise HTTPException(status_code=404, detail=f"Control {evidence_data.control_id} not found")
|
||||
|
||||
source = evidence_data.source or "api"
|
||||
confidence = _classify_confidence(source, evidence_data.evidence_type)
|
||||
truth = _classify_truth_status(source)
|
||||
|
||||
# Allow explicit override from request
|
||||
if evidence_data.confidence_level:
|
||||
try:
|
||||
confidence = EvidenceConfidenceEnum(evidence_data.confidence_level)
|
||||
except ValueError:
|
||||
pass
|
||||
if evidence_data.truth_status:
|
||||
try:
|
||||
truth = EvidenceTruthStatusEnum(evidence_data.truth_status)
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
evidence = repo.create(
|
||||
control_id=control.id,
|
||||
evidence_type=evidence_data.evidence_type,
|
||||
title=evidence_data.title,
|
||||
description=evidence_data.description,
|
||||
artifact_url=evidence_data.artifact_url,
|
||||
valid_from=evidence_data.valid_from,
|
||||
valid_until=evidence_data.valid_until,
|
||||
source=source,
|
||||
ci_job_id=evidence_data.ci_job_id,
|
||||
)
|
||||
|
||||
# Set anti-fake-evidence fields
|
||||
evidence.confidence_level = confidence
|
||||
evidence.truth_status = truth
|
||||
# Generated evidence should not be used as evidence by default
|
||||
if truth == EvidenceTruthStatusEnum.GENERATED:
|
||||
evidence.may_be_used_as_evidence = False
|
||||
|
||||
# Four-Eyes: check if the linked control's domain requires it
|
||||
control_domain = control.domain.value if control.domain else ""
|
||||
if _requires_four_eyes(control_domain):
|
||||
evidence.requires_four_eyes = True
|
||||
evidence.approval_status = "pending_first"
|
||||
|
||||
db.commit()
|
||||
|
||||
# Audit trail
|
||||
log_audit_trail(
|
||||
db, "evidence", evidence.id, evidence.title, "create",
|
||||
performed_by=evidence_data.source or "api",
|
||||
change_summary=f"Evidence created with confidence={confidence.value}, truth={truth.value}",
|
||||
)
|
||||
db.commit()
|
||||
|
||||
return _build_evidence_response(evidence)
|
||||
|
||||
|
||||
@router.delete("/evidence/{evidence_id}")
|
||||
@@ -107,9 +274,271 @@ async def upload_evidence(
|
||||
service: EvidenceService = Depends(get_evidence_service),
|
||||
) -> EvidenceResponse:
|
||||
"""Upload evidence file."""
|
||||
with translate_domain_errors():
|
||||
return await service.upload_evidence(
|
||||
control_id, evidence_type, title, file, description
|
||||
# Get control UUID
|
||||
ctrl_repo = ControlRepository(db)
|
||||
control = ctrl_repo.get_by_control_id(control_id)
|
||||
if not control:
|
||||
raise HTTPException(status_code=404, detail=f"Control {control_id} not found")
|
||||
|
||||
# Create upload directory
|
||||
upload_dir = f"/tmp/compliance_evidence/{control_id}"
|
||||
os.makedirs(upload_dir, exist_ok=True)
|
||||
|
||||
# Save file
|
||||
file_path = os.path.join(upload_dir, file.filename)
|
||||
content = await file.read()
|
||||
|
||||
with open(file_path, "wb") as f:
|
||||
f.write(content)
|
||||
|
||||
# Calculate hash
|
||||
file_hash = hashlib.sha256(content).hexdigest()
|
||||
|
||||
# Create evidence record
|
||||
repo = EvidenceRepository(db)
|
||||
evidence = repo.create(
|
||||
control_id=control.id,
|
||||
evidence_type=evidence_type,
|
||||
title=title,
|
||||
description=description,
|
||||
artifact_path=file_path,
|
||||
artifact_hash=file_hash,
|
||||
file_size_bytes=len(content),
|
||||
mime_type=file.content_type,
|
||||
source="upload",
|
||||
)
|
||||
|
||||
# Upload evidence → E1 + uploaded
|
||||
evidence.confidence_level = EvidenceConfidenceEnum.E1
|
||||
evidence.truth_status = EvidenceTruthStatusEnum.UPLOADED
|
||||
|
||||
# Four-Eyes: check if the linked control's domain requires it
|
||||
control_domain = control.domain.value if control.domain else ""
|
||||
if _requires_four_eyes(control_domain):
|
||||
evidence.requires_four_eyes = True
|
||||
evidence.approval_status = "pending_first"
|
||||
|
||||
db.commit()
|
||||
|
||||
return _build_evidence_response(evidence)
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# CI/CD Evidence Collection — helpers
|
||||
# ============================================================================
|
||||
|
||||
# Map CI source names to the corresponding control IDs
|
||||
SOURCE_CONTROL_MAP = {
|
||||
"sast": "SDLC-001",
|
||||
"dependency_scan": "SDLC-002",
|
||||
"secret_scan": "SDLC-003",
|
||||
"code_review": "SDLC-004",
|
||||
"sbom": "SDLC-005",
|
||||
"container_scan": "SDLC-006",
|
||||
"test_results": "AUD-001",
|
||||
}
|
||||
|
||||
|
||||
def _parse_ci_evidence(data: dict) -> dict:
|
||||
"""
|
||||
Parse and validate incoming CI evidence data.
|
||||
|
||||
Returns a dict with:
|
||||
- report_json: str (serialised JSON)
|
||||
- report_hash: str (SHA-256 hex digest)
|
||||
- evidence_status: str ("valid" or "failed")
|
||||
- findings_count: int
|
||||
- critical_findings: int
|
||||
"""
|
||||
report_json = json.dumps(data) if data else "{}"
|
||||
report_hash = hashlib.sha256(report_json.encode()).hexdigest()
|
||||
|
||||
findings_count = 0
|
||||
critical_findings = 0
|
||||
|
||||
if data and isinstance(data, dict):
|
||||
# Semgrep format
|
||||
if "results" in data:
|
||||
findings_count = len(data.get("results", []))
|
||||
critical_findings = len([
|
||||
r for r in data.get("results", [])
|
||||
if r.get("extra", {}).get("severity", "").upper() in ["CRITICAL", "HIGH"]
|
||||
])
|
||||
|
||||
# Trivy format
|
||||
elif "Results" in data:
|
||||
for result in data.get("Results", []):
|
||||
vulns = result.get("Vulnerabilities", [])
|
||||
findings_count += len(vulns)
|
||||
critical_findings += len([
|
||||
v for v in vulns
|
||||
if v.get("Severity", "").upper() in ["CRITICAL", "HIGH"]
|
||||
])
|
||||
|
||||
# Generic findings array
|
||||
elif "findings" in data:
|
||||
findings_count = len(data.get("findings", []))
|
||||
|
||||
# SBOM format - just count components
|
||||
elif "components" in data:
|
||||
findings_count = len(data.get("components", []))
|
||||
|
||||
evidence_status = "failed" if critical_findings > 0 else "valid"
|
||||
|
||||
return {
|
||||
"report_json": report_json,
|
||||
"report_hash": report_hash,
|
||||
"evidence_status": evidence_status,
|
||||
"findings_count": findings_count,
|
||||
"critical_findings": critical_findings,
|
||||
}
|
||||
|
||||
|
||||
def _store_evidence(
|
||||
db: Session,
|
||||
*,
|
||||
control_db_id: str,
|
||||
source: str,
|
||||
parsed: dict,
|
||||
ci_job_id: str,
|
||||
ci_job_url: str,
|
||||
report_data: dict,
|
||||
) -> EvidenceDB:
|
||||
"""
|
||||
Persist a CI evidence item to the database and write the report file.
|
||||
|
||||
Returns the created EvidenceDB instance (already committed).
|
||||
"""
|
||||
findings_count = parsed["findings_count"]
|
||||
critical_findings = parsed["critical_findings"]
|
||||
|
||||
# Build title and description
|
||||
title = f"{source.upper()} Report - {datetime.now().strftime('%Y-%m-%d %H:%M')}"
|
||||
description = "Automatically collected from CI/CD pipeline"
|
||||
if findings_count > 0:
|
||||
description += f"\n- Total findings: {findings_count}"
|
||||
if critical_findings > 0:
|
||||
description += f"\n- Critical/High findings: {critical_findings}"
|
||||
if ci_job_id:
|
||||
description += f"\n- CI Job ID: {ci_job_id}"
|
||||
if ci_job_url:
|
||||
description += f"\n- CI Job URL: {ci_job_url}"
|
||||
|
||||
# Store report file
|
||||
upload_dir = f"/tmp/compliance_evidence/ci/{source}"
|
||||
os.makedirs(upload_dir, exist_ok=True)
|
||||
file_name = f"{source}_{datetime.now().strftime('%Y%m%d_%H%M%S')}_{parsed['report_hash'][:8]}.json"
|
||||
file_path = os.path.join(upload_dir, file_name)
|
||||
|
||||
with open(file_path, "w") as f:
|
||||
json.dump(report_data or {}, f, indent=2)
|
||||
|
||||
# Create evidence record with anti-fake-evidence classification
|
||||
evidence = EvidenceDB(
|
||||
id=str(uuid_module.uuid4()),
|
||||
control_id=control_db_id,
|
||||
evidence_type=f"ci_{source}",
|
||||
title=title,
|
||||
description=description,
|
||||
artifact_path=file_path,
|
||||
artifact_hash=parsed["report_hash"],
|
||||
file_size_bytes=len(parsed["report_json"]),
|
||||
mime_type="application/json",
|
||||
source="ci_pipeline",
|
||||
ci_job_id=ci_job_id,
|
||||
valid_from=datetime.utcnow(),
|
||||
valid_until=datetime.utcnow() + timedelta(days=90),
|
||||
status=EvidenceStatusEnum(parsed["evidence_status"]),
|
||||
# CI pipeline evidence → E3 observed (system-observed, hash-verified)
|
||||
confidence_level=EvidenceConfidenceEnum.E3,
|
||||
truth_status=EvidenceTruthStatusEnum.OBSERVED,
|
||||
may_be_used_as_evidence=True,
|
||||
)
|
||||
db.add(evidence)
|
||||
db.commit()
|
||||
db.refresh(evidence)
|
||||
|
||||
return evidence
|
||||
|
||||
|
||||
def _extract_findings_detail(report_data: dict) -> dict:
|
||||
"""
|
||||
Extract severity-bucketed finding counts from report data.
|
||||
|
||||
Returns dict with keys: critical, high, medium, low.
|
||||
"""
|
||||
findings_detail = {
|
||||
"critical": 0,
|
||||
"high": 0,
|
||||
"medium": 0,
|
||||
"low": 0,
|
||||
}
|
||||
|
||||
if not report_data:
|
||||
return findings_detail
|
||||
|
||||
# Semgrep format
|
||||
if "results" in report_data:
|
||||
for r in report_data.get("results", []):
|
||||
severity = r.get("extra", {}).get("severity", "").upper()
|
||||
if severity == "CRITICAL":
|
||||
findings_detail["critical"] += 1
|
||||
elif severity == "HIGH":
|
||||
findings_detail["high"] += 1
|
||||
elif severity == "MEDIUM":
|
||||
findings_detail["medium"] += 1
|
||||
elif severity in ["LOW", "INFO"]:
|
||||
findings_detail["low"] += 1
|
||||
|
||||
# Trivy format
|
||||
elif "Results" in report_data:
|
||||
for result in report_data.get("Results", []):
|
||||
for v in result.get("Vulnerabilities", []):
|
||||
severity = v.get("Severity", "").upper()
|
||||
if severity == "CRITICAL":
|
||||
findings_detail["critical"] += 1
|
||||
elif severity == "HIGH":
|
||||
findings_detail["high"] += 1
|
||||
elif severity == "MEDIUM":
|
||||
findings_detail["medium"] += 1
|
||||
elif severity == "LOW":
|
||||
findings_detail["low"] += 1
|
||||
|
||||
# Generic findings with severity
|
||||
elif "findings" in report_data:
|
||||
for f in report_data.get("findings", []):
|
||||
severity = f.get("severity", "").upper()
|
||||
if severity == "CRITICAL":
|
||||
findings_detail["critical"] += 1
|
||||
elif severity == "HIGH":
|
||||
findings_detail["high"] += 1
|
||||
elif severity == "MEDIUM":
|
||||
findings_detail["medium"] += 1
|
||||
else:
|
||||
findings_detail["low"] += 1
|
||||
|
||||
return findings_detail
|
||||
|
||||
|
||||
def _update_risks(db: Session, *, source: str, control_id: str, ci_job_id: str, report_data: dict):
|
||||
"""
|
||||
Update risk status based on new evidence.
|
||||
|
||||
Uses AutoRiskUpdater to update Control status and linked Risks based on
|
||||
severity-bucketed findings. Returns the update result or None on error.
|
||||
"""
|
||||
findings_detail = _extract_findings_detail(report_data)
|
||||
|
||||
try:
|
||||
auto_updater = AutoRiskUpdater(db)
|
||||
risk_update_result = auto_updater.process_evidence_collect_request(
|
||||
tool=source,
|
||||
control_id=control_id,
|
||||
evidence_type=f"ci_{source}",
|
||||
timestamp=datetime.utcnow().isoformat(),
|
||||
commit_sha=report_data.get("commit_sha", "unknown") if report_data else "unknown",
|
||||
ci_job_id=ci_job_id,
|
||||
findings=findings_detail,
|
||||
)
|
||||
|
||||
|
||||
@@ -227,14 +656,229 @@ async def get_ci_evidence_status(
|
||||
# Legacy re-exports for tests that import helpers directly.
|
||||
# ----------------------------------------------------------------------------
|
||||
|
||||
__all__ = [
|
||||
"router",
|
||||
"SOURCE_CONTROL_MAP",
|
||||
"EvidenceRepository",
|
||||
"ControlRepository",
|
||||
"AutoRiskUpdater",
|
||||
"_parse_ci_evidence",
|
||||
"_extract_findings_detail",
|
||||
"_store_evidence",
|
||||
"_update_risks",
|
||||
]
|
||||
if control_id:
|
||||
ctrl_repo = ControlRepository(db)
|
||||
control = ctrl_repo.get_by_control_id(control_id)
|
||||
if control:
|
||||
query = query.filter(EvidenceDB.control_id == control.id)
|
||||
|
||||
evidence_list = query.order_by(EvidenceDB.collected_at.desc()).limit(100).all()
|
||||
|
||||
# Group by control and calculate stats
|
||||
control_stats = defaultdict(lambda: {
|
||||
"total": 0,
|
||||
"valid": 0,
|
||||
"failed": 0,
|
||||
"last_collected": None,
|
||||
"evidence": [],
|
||||
})
|
||||
|
||||
for e in evidence_list:
|
||||
# Get control_id string
|
||||
control = db.query(ControlDB).filter(ControlDB.id == e.control_id).first()
|
||||
ctrl_id = control.control_id if control else "unknown"
|
||||
|
||||
stats = control_stats[ctrl_id]
|
||||
stats["total"] += 1
|
||||
if e.status:
|
||||
if e.status.value == "valid":
|
||||
stats["valid"] += 1
|
||||
elif e.status.value == "failed":
|
||||
stats["failed"] += 1
|
||||
if not stats["last_collected"] or e.collected_at > stats["last_collected"]:
|
||||
stats["last_collected"] = e.collected_at
|
||||
|
||||
# Add evidence summary
|
||||
stats["evidence"].append({
|
||||
"id": e.id,
|
||||
"type": e.evidence_type,
|
||||
"status": e.status.value if e.status else None,
|
||||
"collected_at": e.collected_at.isoformat() if e.collected_at else None,
|
||||
"ci_job_id": e.ci_job_id,
|
||||
})
|
||||
|
||||
# Convert to list and sort
|
||||
result = []
|
||||
for ctrl_id, stats in control_stats.items():
|
||||
result.append({
|
||||
"control_id": ctrl_id,
|
||||
"total_evidence": stats["total"],
|
||||
"valid_count": stats["valid"],
|
||||
"failed_count": stats["failed"],
|
||||
"last_collected": stats["last_collected"].isoformat() if stats["last_collected"] else None,
|
||||
"recent_evidence": stats["evidence"][:5],
|
||||
})
|
||||
|
||||
result.sort(key=lambda x: x["last_collected"] or "", reverse=True)
|
||||
|
||||
return {
|
||||
"period_days": days,
|
||||
"total_evidence": len(evidence_list),
|
||||
"controls": result,
|
||||
}
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Evidence Review (Anti-Fake-Evidence)
|
||||
# ============================================================================
|
||||
|
||||
from pydantic import BaseModel as _BaseModel
|
||||
|
||||
class _EvidenceReviewRequest(_BaseModel):
|
||||
confidence_level: Optional[str] = None
|
||||
truth_status: Optional[str] = None
|
||||
reviewed_by: str
|
||||
|
||||
|
||||
@router.patch("/evidence/{evidence_id}/review", response_model=EvidenceResponse)
|
||||
async def review_evidence(
|
||||
evidence_id: str,
|
||||
review: _EvidenceReviewRequest,
|
||||
db: Session = Depends(get_db),
|
||||
):
|
||||
"""
|
||||
Review evidence: upgrade confidence level and/or change truth status.
|
||||
|
||||
For Four-Eyes evidence, the first reviewer sets first_reviewer and
|
||||
approval_status='first_approved'. A second (different) reviewer then
|
||||
sets second_reviewer and approval_status='approved'.
|
||||
"""
|
||||
evidence = db.query(EvidenceDB).filter(EvidenceDB.id == evidence_id).first()
|
||||
if not evidence:
|
||||
raise HTTPException(status_code=404, detail=f"Evidence {evidence_id} not found")
|
||||
|
||||
old_confidence = evidence.confidence_level.value if evidence.confidence_level else None
|
||||
old_truth = evidence.truth_status.value if evidence.truth_status else None
|
||||
|
||||
if review.confidence_level:
|
||||
try:
|
||||
evidence.confidence_level = EvidenceConfidenceEnum(review.confidence_level)
|
||||
except ValueError:
|
||||
raise HTTPException(status_code=400, detail=f"Invalid confidence_level: {review.confidence_level}")
|
||||
|
||||
if review.truth_status:
|
||||
try:
|
||||
evidence.truth_status = EvidenceTruthStatusEnum(review.truth_status)
|
||||
except ValueError:
|
||||
raise HTTPException(status_code=400, detail=f"Invalid truth_status: {review.truth_status}")
|
||||
|
||||
# Four-Eyes branching
|
||||
if evidence.requires_four_eyes:
|
||||
status = evidence.approval_status or "none"
|
||||
if status in ("none", "pending_first"):
|
||||
evidence.first_reviewer = review.reviewed_by
|
||||
evidence.first_reviewed_at = datetime.utcnow()
|
||||
evidence.approval_status = "first_approved"
|
||||
elif status == "first_approved":
|
||||
if review.reviewed_by == evidence.first_reviewer:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail="Four-Eyes: second reviewer must be different from first reviewer",
|
||||
)
|
||||
evidence.second_reviewer = review.reviewed_by
|
||||
evidence.second_reviewed_at = datetime.utcnow()
|
||||
evidence.approval_status = "approved"
|
||||
elif status == "approved":
|
||||
raise HTTPException(status_code=400, detail="Evidence already approved")
|
||||
elif status == "rejected":
|
||||
raise HTTPException(status_code=400, detail="Evidence was rejected — create new evidence instead")
|
||||
|
||||
evidence.reviewed_by = review.reviewed_by
|
||||
evidence.reviewed_at = datetime.utcnow()
|
||||
db.commit()
|
||||
|
||||
# Audit trail
|
||||
new_confidence = evidence.confidence_level.value if evidence.confidence_level else None
|
||||
if old_confidence != new_confidence:
|
||||
log_audit_trail(
|
||||
db, "evidence", evidence_id, evidence.title, "review",
|
||||
performed_by=review.reviewed_by,
|
||||
field_changed="confidence_level",
|
||||
old_value=old_confidence,
|
||||
new_value=new_confidence,
|
||||
)
|
||||
new_truth = evidence.truth_status.value if evidence.truth_status else None
|
||||
if old_truth != new_truth:
|
||||
log_audit_trail(
|
||||
db, "evidence", evidence_id, evidence.title, "review",
|
||||
performed_by=review.reviewed_by,
|
||||
field_changed="truth_status",
|
||||
old_value=old_truth,
|
||||
new_value=new_truth,
|
||||
)
|
||||
db.commit()
|
||||
|
||||
db.refresh(evidence)
|
||||
return _build_evidence_response(evidence)
|
||||
|
||||
|
||||
@router.patch("/evidence/{evidence_id}/reject", response_model=EvidenceResponse)
|
||||
async def reject_evidence(
|
||||
evidence_id: str,
|
||||
body: EvidenceRejectRequest,
|
||||
db: Session = Depends(get_db),
|
||||
):
|
||||
"""Reject evidence (sets approval_status='rejected')."""
|
||||
evidence = db.query(EvidenceDB).filter(EvidenceDB.id == evidence_id).first()
|
||||
if not evidence:
|
||||
raise HTTPException(status_code=404, detail=f"Evidence {evidence_id} not found")
|
||||
|
||||
evidence.approval_status = "rejected"
|
||||
evidence.reviewed_by = body.reviewed_by
|
||||
evidence.reviewed_at = datetime.utcnow()
|
||||
db.commit()
|
||||
|
||||
log_audit_trail(
|
||||
db, "evidence", evidence_id, evidence.title, "reject",
|
||||
performed_by=body.reviewed_by,
|
||||
change_summary=body.rejection_reason or "Evidence rejected",
|
||||
)
|
||||
db.commit()
|
||||
|
||||
db.refresh(evidence)
|
||||
return _build_evidence_response(evidence)
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Audit Trail Query
|
||||
# ============================================================================
|
||||
|
||||
@router.get("/audit-trail")
|
||||
async def get_audit_trail(
|
||||
entity_type: Optional[str] = Query(None),
|
||||
entity_id: Optional[str] = Query(None),
|
||||
action: Optional[str] = Query(None),
|
||||
limit: int = Query(50, ge=1, le=200),
|
||||
db: Session = Depends(get_db),
|
||||
):
|
||||
"""Query audit trail entries for an entity."""
|
||||
query = db.query(AuditTrailDB)
|
||||
if entity_type:
|
||||
query = query.filter(AuditTrailDB.entity_type == entity_type)
|
||||
if entity_id:
|
||||
query = query.filter(AuditTrailDB.entity_id == entity_id)
|
||||
if action:
|
||||
query = query.filter(AuditTrailDB.action == action)
|
||||
|
||||
records = query.order_by(AuditTrailDB.performed_at.desc()).limit(limit).all()
|
||||
|
||||
return {
|
||||
"entries": [
|
||||
{
|
||||
"id": r.id,
|
||||
"entity_type": r.entity_type,
|
||||
"entity_id": r.entity_id,
|
||||
"entity_name": r.entity_name,
|
||||
"action": r.action,
|
||||
"field_changed": r.field_changed,
|
||||
"old_value": r.old_value,
|
||||
"new_value": r.new_value,
|
||||
"change_summary": r.change_summary,
|
||||
"performed_by": r.performed_by,
|
||||
"performed_at": r.performed_at.isoformat() if r.performed_at else None,
|
||||
"checksum": r.checksum,
|
||||
}
|
||||
for r in records
|
||||
],
|
||||
"total": len(records),
|
||||
}
|
||||
|
||||
@@ -39,7 +39,6 @@ router = APIRouter(tags=["extraction"])
|
||||
|
||||
ALL_COLLECTIONS = [
|
||||
"bp_compliance_ce", # BSI-TR documents — primary Prüfaspekte source
|
||||
"bp_compliance_recht", # Legal texts (GDPR, AI Act, ...)
|
||||
"bp_compliance_gesetze", # German laws
|
||||
"bp_compliance_datenschutz", # Data protection documents
|
||||
"bp_dsfa_corpus", # DSFA corpus
|
||||
|
||||
@@ -80,9 +80,13 @@ def _handle(func, *args, **kwargs): # type: ignore[no-untyped-def]
|
||||
raise HTTPException(status_code=400, detail=str(exc))
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# ISMS Scope (ISO 27001 4.3)
|
||||
# ============================================================================
|
||||
# Shared audit trail utilities — canonical implementation in audit_trail_utils.py
|
||||
from .audit_trail_utils import log_audit_trail, create_signature # noqa: E402
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# ISMS SCOPE (ISO 27001 4.3)
|
||||
# =============================================================================
|
||||
|
||||
@router.get("/scope", response_model=ISMSScopeResponse)
|
||||
async def get_isms_scope(db: Session = Depends(get_db)):
|
||||
|
||||
@@ -50,6 +50,57 @@ VALID_DOCUMENT_TYPES = {
|
||||
"cookie_banner",
|
||||
"agb",
|
||||
"clause",
|
||||
# Security document templates (Migration 051)
|
||||
"it_security_concept",
|
||||
"data_protection_concept",
|
||||
"backup_recovery_concept",
|
||||
"logging_concept",
|
||||
"incident_response_plan",
|
||||
"access_control_concept",
|
||||
"risk_management_concept",
|
||||
# Policy templates — IT Security (Migration 054)
|
||||
"information_security_policy",
|
||||
"access_control_policy",
|
||||
"password_policy",
|
||||
"encryption_policy",
|
||||
"logging_policy",
|
||||
"backup_policy",
|
||||
"incident_response_policy",
|
||||
"change_management_policy",
|
||||
"patch_management_policy",
|
||||
"asset_management_policy",
|
||||
"cloud_security_policy",
|
||||
"devsecops_policy",
|
||||
"secrets_management_policy",
|
||||
"vulnerability_management_policy",
|
||||
# Policy templates — Data (Migration 054)
|
||||
"data_protection_policy",
|
||||
"data_classification_policy",
|
||||
"data_retention_policy",
|
||||
"data_transfer_policy",
|
||||
"privacy_incident_policy",
|
||||
# Policy templates — Personnel (Migration 054)
|
||||
"employee_security_policy",
|
||||
"security_awareness_policy",
|
||||
"remote_work_policy",
|
||||
"offboarding_policy",
|
||||
# Policy templates — Vendor/Supply Chain (Migration 054)
|
||||
"vendor_risk_management_policy",
|
||||
"third_party_security_policy",
|
||||
"supplier_security_policy",
|
||||
# Policy templates — BCM (Migration 054)
|
||||
"business_continuity_policy",
|
||||
"disaster_recovery_policy",
|
||||
"crisis_management_policy",
|
||||
# CRA Cybersecurity (Migration 056)
|
||||
"cybersecurity_policy",
|
||||
# DSFA template
|
||||
"dsfa",
|
||||
# Module document templates (Migration 073)
|
||||
"vvt_register",
|
||||
"tom_documentation",
|
||||
"loeschkonzept",
|
||||
"pflichtenregister",
|
||||
}
|
||||
VALID_STATUSES = {"published", "draft", "archived"}
|
||||
|
||||
|
||||
162
backend-compliance/compliance/api/llm_audit_routes.py
Normal file
162
backend-compliance/compliance/api/llm_audit_routes.py
Normal file
@@ -0,0 +1,162 @@
|
||||
"""
|
||||
FastAPI routes for LLM Generation Audit Trail.
|
||||
|
||||
Endpoints:
|
||||
- POST /llm-audit: Record an LLM generation event
|
||||
- GET /llm-audit: List audit records with filters
|
||||
"""
|
||||
|
||||
import logging
|
||||
import uuid as uuid_module
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
|
||||
from fastapi import APIRouter, Depends, Query
|
||||
from pydantic import BaseModel
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from classroom_engine.database import get_db
|
||||
from ..db.models import LLMGenerationAuditDB
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
router = APIRouter(tags=["compliance-llm-audit"])
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Schemas
|
||||
# ============================================================================
|
||||
|
||||
class LLMAuditCreate(BaseModel):
|
||||
entity_type: str
|
||||
entity_id: Optional[str] = None
|
||||
generation_mode: str
|
||||
truth_status: str = "generated"
|
||||
may_be_used_as_evidence: bool = False
|
||||
llm_model: Optional[str] = None
|
||||
llm_provider: Optional[str] = None
|
||||
prompt_hash: Optional[str] = None
|
||||
input_summary: Optional[str] = None
|
||||
output_summary: Optional[str] = None
|
||||
metadata: Optional[dict] = None
|
||||
tenant_id: Optional[str] = None
|
||||
|
||||
|
||||
class LLMAuditResponse(BaseModel):
|
||||
id: str
|
||||
tenant_id: Optional[str] = None
|
||||
entity_type: str
|
||||
entity_id: Optional[str] = None
|
||||
generation_mode: str
|
||||
truth_status: str
|
||||
may_be_used_as_evidence: bool
|
||||
llm_model: Optional[str] = None
|
||||
llm_provider: Optional[str] = None
|
||||
prompt_hash: Optional[str] = None
|
||||
input_summary: Optional[str] = None
|
||||
output_summary: Optional[str] = None
|
||||
metadata: Optional[dict] = None
|
||||
created_at: datetime
|
||||
|
||||
class Config:
|
||||
from_attributes = True
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Routes
|
||||
# ============================================================================
|
||||
|
||||
@router.post("/llm-audit", response_model=LLMAuditResponse)
|
||||
async def create_llm_audit(
|
||||
data: LLMAuditCreate,
|
||||
db: Session = Depends(get_db),
|
||||
):
|
||||
"""Record an LLM generation event for audit trail."""
|
||||
from ..db.models import EvidenceTruthStatusEnum
|
||||
|
||||
# Validate truth_status
|
||||
try:
|
||||
truth_enum = EvidenceTruthStatusEnum(data.truth_status)
|
||||
except ValueError:
|
||||
truth_enum = EvidenceTruthStatusEnum.GENERATED
|
||||
|
||||
record = LLMGenerationAuditDB(
|
||||
id=str(uuid_module.uuid4()),
|
||||
tenant_id=data.tenant_id,
|
||||
entity_type=data.entity_type,
|
||||
entity_id=data.entity_id,
|
||||
generation_mode=data.generation_mode,
|
||||
truth_status=truth_enum,
|
||||
may_be_used_as_evidence=data.may_be_used_as_evidence,
|
||||
llm_model=data.llm_model,
|
||||
llm_provider=data.llm_provider,
|
||||
prompt_hash=data.prompt_hash,
|
||||
input_summary=data.input_summary[:500] if data.input_summary else None,
|
||||
output_summary=data.output_summary[:500] if data.output_summary else None,
|
||||
extra_metadata=data.metadata or {},
|
||||
)
|
||||
db.add(record)
|
||||
db.commit()
|
||||
db.refresh(record)
|
||||
|
||||
return LLMAuditResponse(
|
||||
id=record.id,
|
||||
tenant_id=record.tenant_id,
|
||||
entity_type=record.entity_type,
|
||||
entity_id=record.entity_id,
|
||||
generation_mode=record.generation_mode,
|
||||
truth_status=record.truth_status.value if record.truth_status else "generated",
|
||||
may_be_used_as_evidence=record.may_be_used_as_evidence,
|
||||
llm_model=record.llm_model,
|
||||
llm_provider=record.llm_provider,
|
||||
prompt_hash=record.prompt_hash,
|
||||
input_summary=record.input_summary,
|
||||
output_summary=record.output_summary,
|
||||
metadata=record.extra_metadata,
|
||||
created_at=record.created_at,
|
||||
)
|
||||
|
||||
|
||||
@router.get("/llm-audit")
|
||||
async def list_llm_audit(
|
||||
entity_type: Optional[str] = Query(None),
|
||||
entity_id: Optional[str] = Query(None),
|
||||
page: int = Query(1, ge=1),
|
||||
limit: int = Query(50, ge=1, le=200),
|
||||
db: Session = Depends(get_db),
|
||||
):
|
||||
"""List LLM generation audit records with optional filters."""
|
||||
query = db.query(LLMGenerationAuditDB)
|
||||
|
||||
if entity_type:
|
||||
query = query.filter(LLMGenerationAuditDB.entity_type == entity_type)
|
||||
if entity_id:
|
||||
query = query.filter(LLMGenerationAuditDB.entity_id == entity_id)
|
||||
|
||||
total = query.count()
|
||||
offset = (page - 1) * limit
|
||||
records = query.order_by(LLMGenerationAuditDB.created_at.desc()).offset(offset).limit(limit).all()
|
||||
|
||||
return {
|
||||
"records": [
|
||||
LLMAuditResponse(
|
||||
id=r.id,
|
||||
tenant_id=r.tenant_id,
|
||||
entity_type=r.entity_type,
|
||||
entity_id=r.entity_id,
|
||||
generation_mode=r.generation_mode,
|
||||
truth_status=r.truth_status.value if r.truth_status else "generated",
|
||||
may_be_used_as_evidence=r.may_be_used_as_evidence,
|
||||
llm_model=r.llm_model,
|
||||
llm_provider=r.llm_provider,
|
||||
prompt_hash=r.prompt_hash,
|
||||
input_summary=r.input_summary,
|
||||
output_summary=r.output_summary,
|
||||
metadata=r.extra_metadata,
|
||||
created_at=r.created_at,
|
||||
)
|
||||
for r in records
|
||||
],
|
||||
"total": total,
|
||||
"page": page,
|
||||
"limit": limit,
|
||||
}
|
||||
@@ -56,6 +56,7 @@ class LoeschfristCreate(BaseModel):
|
||||
responsible_person: Optional[str] = None
|
||||
release_process: Optional[str] = None
|
||||
linked_vvt_activity_ids: Optional[List[Any]] = None
|
||||
linked_vendor_ids: Optional[List[Any]] = None
|
||||
status: str = "DRAFT"
|
||||
last_review_date: Optional[datetime] = None
|
||||
next_review_date: Optional[datetime] = None
|
||||
@@ -86,6 +87,7 @@ class LoeschfristUpdate(BaseModel):
|
||||
responsible_person: Optional[str] = None
|
||||
release_process: Optional[str] = None
|
||||
linked_vvt_activity_ids: Optional[List[Any]] = None
|
||||
linked_vendor_ids: Optional[List[Any]] = None
|
||||
status: Optional[str] = None
|
||||
last_review_date: Optional[datetime] = None
|
||||
next_review_date: Optional[datetime] = None
|
||||
@@ -100,7 +102,7 @@ class StatusUpdate(BaseModel):
|
||||
# JSONB fields that need CAST
|
||||
JSONB_FIELDS = {
|
||||
"affected_groups", "data_categories", "legal_holds",
|
||||
"storage_locations", "linked_vvt_activity_ids", "tags"
|
||||
"storage_locations", "linked_vvt_activity_ids", "linked_vendor_ids", "tags"
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -42,6 +42,7 @@ class ObligationCreate(BaseModel):
|
||||
priority: str = "medium"
|
||||
responsible: Optional[str] = None
|
||||
linked_systems: Optional[List[str]] = None
|
||||
linked_vendor_ids: Optional[List[str]] = None
|
||||
assessment_id: Optional[str] = None
|
||||
rule_code: Optional[str] = None
|
||||
notes: Optional[str] = None
|
||||
@@ -57,6 +58,7 @@ class ObligationUpdate(BaseModel):
|
||||
priority: Optional[str] = None
|
||||
responsible: Optional[str] = None
|
||||
linked_systems: Optional[List[str]] = None
|
||||
linked_vendor_ids: Optional[List[str]] = None
|
||||
notes: Optional[str] = None
|
||||
|
||||
|
||||
@@ -173,14 +175,15 @@ async def create_obligation(
|
||||
|
||||
import json
|
||||
linked_systems = json.dumps(payload.linked_systems or [])
|
||||
linked_vendor_ids = json.dumps(payload.linked_vendor_ids or [])
|
||||
|
||||
row = db.execute(text("""
|
||||
INSERT INTO compliance_obligations
|
||||
(tenant_id, title, description, source, source_article, deadline,
|
||||
status, priority, responsible, linked_systems, assessment_id, rule_code, notes)
|
||||
status, priority, responsible, linked_systems, linked_vendor_ids, assessment_id, rule_code, notes)
|
||||
VALUES
|
||||
(:tenant_id, :title, :description, :source, :source_article, :deadline,
|
||||
:status, :priority, :responsible, CAST(:linked_systems AS jsonb), :assessment_id, :rule_code, :notes)
|
||||
:status, :priority, :responsible, CAST(:linked_systems AS jsonb), CAST(:linked_vendor_ids AS jsonb), :assessment_id, :rule_code, :notes)
|
||||
RETURNING *
|
||||
"""), {
|
||||
"tenant_id": tenant_id,
|
||||
@@ -193,6 +196,7 @@ async def create_obligation(
|
||||
"priority": payload.priority,
|
||||
"responsible": payload.responsible,
|
||||
"linked_systems": linked_systems,
|
||||
"linked_vendor_ids": linked_vendor_ids,
|
||||
"assessment_id": payload.assessment_id,
|
||||
"rule_code": payload.rule_code,
|
||||
"notes": payload.notes,
|
||||
@@ -235,6 +239,9 @@ async def update_obligation(
|
||||
if field == "linked_systems":
|
||||
updates["linked_systems"] = json.dumps(value or [])
|
||||
set_clauses.append("linked_systems = CAST(:linked_systems AS jsonb)")
|
||||
elif field == "linked_vendor_ids":
|
||||
updates["linked_vendor_ids"] = json.dumps(value or [])
|
||||
set_clauses.append("linked_vendor_ids = CAST(:linked_vendor_ids AS jsonb)")
|
||||
else:
|
||||
updates[field] = value
|
||||
set_clauses.append(f"{field} = :{field}")
|
||||
|
||||
1072
backend-compliance/compliance/api/process_task_routes.py
Normal file
1072
backend-compliance/compliance/api/process_task_routes.py
Normal file
File diff suppressed because it is too large
Load Diff
@@ -25,6 +25,7 @@ from sqlalchemy.orm import Session
|
||||
|
||||
from classroom_engine.database import get_db
|
||||
|
||||
from .audit_trail_utils import log_audit_trail
|
||||
from ..db import (
|
||||
ControlDomainEnum,
|
||||
ControlRepository,
|
||||
@@ -312,8 +313,39 @@ async def get_control(
|
||||
svc: ControlExportService = Depends(get_ctrl_export_service),
|
||||
) -> ControlResponse:
|
||||
"""Get a specific control by control_id."""
|
||||
with translate_domain_errors():
|
||||
return svc.get_control(control_id)
|
||||
repo = ControlRepository(db)
|
||||
control = repo.get_by_control_id(control_id)
|
||||
if not control:
|
||||
raise HTTPException(status_code=404, detail=f"Control {control_id} not found")
|
||||
|
||||
evidence_repo = EvidenceRepository(db)
|
||||
evidence = evidence_repo.get_by_control(control.id)
|
||||
|
||||
return ControlResponse(
|
||||
id=control.id,
|
||||
control_id=control.control_id,
|
||||
domain=control.domain.value if control.domain else None,
|
||||
control_type=control.control_type.value if control.control_type else None,
|
||||
title=control.title,
|
||||
description=control.description,
|
||||
pass_criteria=control.pass_criteria,
|
||||
implementation_guidance=control.implementation_guidance,
|
||||
code_reference=control.code_reference,
|
||||
documentation_url=control.documentation_url,
|
||||
is_automated=control.is_automated,
|
||||
automation_tool=control.automation_tool,
|
||||
automation_config=control.automation_config,
|
||||
owner=control.owner,
|
||||
review_frequency_days=control.review_frequency_days,
|
||||
status=control.status.value if control.status else None,
|
||||
status_notes=control.status_notes,
|
||||
status_justification=control.status_justification,
|
||||
last_reviewed_at=control.last_reviewed_at,
|
||||
next_review_at=control.next_review_at,
|
||||
created_at=control.created_at,
|
||||
updated_at=control.updated_at,
|
||||
evidence_count=len(evidence),
|
||||
)
|
||||
|
||||
|
||||
@router.put(
|
||||
@@ -325,8 +357,83 @@ async def update_control(
|
||||
svc: ControlExportService = Depends(get_ctrl_export_service),
|
||||
) -> ControlResponse:
|
||||
"""Update a control."""
|
||||
with translate_domain_errors():
|
||||
return svc.update_control(control_id, update)
|
||||
repo = ControlRepository(db)
|
||||
control = repo.get_by_control_id(control_id)
|
||||
if not control:
|
||||
raise HTTPException(status_code=404, detail=f"Control {control_id} not found")
|
||||
|
||||
update_data = update.model_dump(exclude_unset=True)
|
||||
|
||||
# Convert status string to enum and validate transition
|
||||
if "status" in update_data:
|
||||
try:
|
||||
new_status_enum = ControlStatusEnum(update_data["status"])
|
||||
except ValueError:
|
||||
raise HTTPException(status_code=400, detail=f"Invalid status: {update_data['status']}")
|
||||
|
||||
# Validate status transition (Anti-Fake-Evidence)
|
||||
from ..services.control_status_machine import validate_transition
|
||||
current_status = control.status.value if control.status else "planned"
|
||||
evidence_list = db.query(EvidenceDB).filter(EvidenceDB.control_id == control.id).all()
|
||||
allowed, violations = validate_transition(
|
||||
current_status=current_status,
|
||||
new_status=update_data["status"],
|
||||
evidence_list=evidence_list,
|
||||
status_justification=update_data.get("status_justification") or update_data.get("status_notes"),
|
||||
)
|
||||
if not allowed:
|
||||
raise HTTPException(
|
||||
status_code=409,
|
||||
detail={
|
||||
"error": "Status transition not allowed",
|
||||
"current_status": current_status,
|
||||
"requested_status": update_data["status"],
|
||||
"violations": violations,
|
||||
}
|
||||
)
|
||||
|
||||
update_data["status"] = new_status_enum
|
||||
|
||||
updated = repo.update(control.id, **update_data)
|
||||
db.commit()
|
||||
|
||||
# Audit trail for status changes
|
||||
new_status = updated.status.value if updated.status else None
|
||||
if "status" in update.model_dump(exclude_unset=True) and current_status != new_status:
|
||||
log_audit_trail(
|
||||
db, "control", control.id, updated.control_id or updated.title,
|
||||
"status_change",
|
||||
performed_by=update.owner or "system",
|
||||
field_changed="status",
|
||||
old_value=current_status,
|
||||
new_value=new_status,
|
||||
)
|
||||
db.commit()
|
||||
|
||||
return ControlResponse(
|
||||
id=updated.id,
|
||||
control_id=updated.control_id,
|
||||
domain=updated.domain.value if updated.domain else None,
|
||||
control_type=updated.control_type.value if updated.control_type else None,
|
||||
title=updated.title,
|
||||
description=updated.description,
|
||||
pass_criteria=updated.pass_criteria,
|
||||
implementation_guidance=updated.implementation_guidance,
|
||||
code_reference=updated.code_reference,
|
||||
documentation_url=updated.documentation_url,
|
||||
is_automated=updated.is_automated,
|
||||
automation_tool=updated.automation_tool,
|
||||
automation_config=updated.automation_config,
|
||||
owner=updated.owner,
|
||||
review_frequency_days=updated.review_frequency_days,
|
||||
status=updated.status.value if updated.status else None,
|
||||
status_notes=updated.status_notes,
|
||||
status_justification=updated.status_justification,
|
||||
last_reviewed_at=updated.last_reviewed_at,
|
||||
next_review_at=updated.next_review_at,
|
||||
created_at=updated.created_at,
|
||||
updated_at=updated.updated_at,
|
||||
)
|
||||
|
||||
|
||||
@router.put(
|
||||
@@ -339,8 +446,43 @@ async def review_control(
|
||||
svc: ControlExportService = Depends(get_ctrl_export_service),
|
||||
) -> ControlResponse:
|
||||
"""Mark a control as reviewed with new status."""
|
||||
with translate_domain_errors():
|
||||
return svc.review_control(control_id, review)
|
||||
repo = ControlRepository(db)
|
||||
control = repo.get_by_control_id(control_id)
|
||||
if not control:
|
||||
raise HTTPException(status_code=404, detail=f"Control {control_id} not found")
|
||||
|
||||
try:
|
||||
status_enum = ControlStatusEnum(review.status)
|
||||
except ValueError:
|
||||
raise HTTPException(status_code=400, detail=f"Invalid status: {review.status}")
|
||||
|
||||
updated = repo.mark_reviewed(control.id, status_enum, review.status_notes)
|
||||
db.commit()
|
||||
|
||||
return ControlResponse(
|
||||
id=updated.id,
|
||||
control_id=updated.control_id,
|
||||
domain=updated.domain.value if updated.domain else None,
|
||||
control_type=updated.control_type.value if updated.control_type else None,
|
||||
title=updated.title,
|
||||
description=updated.description,
|
||||
pass_criteria=updated.pass_criteria,
|
||||
implementation_guidance=updated.implementation_guidance,
|
||||
code_reference=updated.code_reference,
|
||||
documentation_url=updated.documentation_url,
|
||||
is_automated=updated.is_automated,
|
||||
automation_tool=updated.automation_tool,
|
||||
automation_config=updated.automation_config,
|
||||
owner=updated.owner,
|
||||
review_frequency_days=updated.review_frequency_days,
|
||||
status=updated.status.value if updated.status else None,
|
||||
status_notes=updated.status_notes,
|
||||
status_justification=updated.status_justification,
|
||||
last_reviewed_at=updated.last_reviewed_at,
|
||||
next_review_at=updated.next_review_at,
|
||||
created_at=updated.created_at,
|
||||
updated_at=updated.updated_at,
|
||||
)
|
||||
|
||||
|
||||
@router.get(
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -22,7 +22,9 @@ import uuid
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any
|
||||
|
||||
from fastapi import APIRouter, File, Form, HTTPException, UploadFile
|
||||
import httpx
|
||||
from fastapi import APIRouter, File, Form, UploadFile, HTTPException
|
||||
from pydantic import BaseModel
|
||||
from sqlalchemy import text
|
||||
|
||||
from database import SessionLocal # re-exported below for legacy test patches
|
||||
@@ -96,15 +98,13 @@ async def scan_dependencies(
|
||||
db = SessionLocal()
|
||||
try:
|
||||
db.execute(
|
||||
text(
|
||||
"INSERT INTO compliance_screenings "
|
||||
"(id, tenant_id, status, sbom_format, sbom_version, "
|
||||
"total_components, total_issues, critical_issues, high_issues, "
|
||||
"medium_issues, low_issues, sbom_data, started_at, completed_at) "
|
||||
"VALUES (:id, :tenant_id, 'completed', 'CycloneDX', '1.5', "
|
||||
":total_components, :total_issues, :critical, :high, :medium, :low, "
|
||||
":sbom_data::jsonb, :started_at, :completed_at)"
|
||||
),
|
||||
text("""INSERT INTO compliance_screenings
|
||||
(id, tenant_id, status, sbom_format, sbom_version,
|
||||
total_components, total_issues, critical_issues, high_issues, medium_issues, low_issues,
|
||||
sbom_data, started_at, completed_at)
|
||||
VALUES (:id, :tenant_id, 'completed', 'CycloneDX', '1.5',
|
||||
:total_components, :total_issues, :critical, :high, :medium, :low,
|
||||
:sbom_data::jsonb, :started_at, :completed_at)"""),
|
||||
{
|
||||
"id": screening_id,
|
||||
"tenant_id": tenant_id,
|
||||
@@ -121,13 +121,11 @@ async def scan_dependencies(
|
||||
)
|
||||
for issue in issues:
|
||||
db.execute(
|
||||
text(
|
||||
"INSERT INTO compliance_security_issues "
|
||||
"(id, screening_id, severity, title, description, cve, cvss, "
|
||||
"affected_component, affected_version, fixed_in, remediation, status) "
|
||||
"VALUES (:id, :screening_id, :severity, :title, :description, :cve, :cvss, "
|
||||
":component, :version, :fixed_in, :remediation, :status)"
|
||||
),
|
||||
text("""INSERT INTO compliance_security_issues
|
||||
(id, screening_id, severity, title, description, cve, cvss,
|
||||
affected_component, affected_version, fixed_in, remediation, status)
|
||||
VALUES (:id, :screening_id, :severity, :title, :description, :cve, :cvss,
|
||||
:component, :version, :fixed_in, :remediation, :status)"""),
|
||||
{
|
||||
"id": issue["id"],
|
||||
"screening_id": screening_id,
|
||||
@@ -214,8 +212,77 @@ async def get_screening(screening_id: str) -> ScreeningResponse:
|
||||
"""Get a screening result by ID."""
|
||||
db = SessionLocal()
|
||||
try:
|
||||
with translate_domain_errors():
|
||||
return ScreeningService(db).get_screening(screening_id)
|
||||
result = db.execute(
|
||||
text("""SELECT id, status, sbom_format, sbom_version,
|
||||
total_components, total_issues, critical_issues, high_issues,
|
||||
medium_issues, low_issues, sbom_data, started_at, completed_at
|
||||
FROM compliance_screenings WHERE id = :id"""),
|
||||
{"id": screening_id},
|
||||
)
|
||||
row = result.fetchone()
|
||||
if not row:
|
||||
raise HTTPException(status_code=404, detail="Screening not found")
|
||||
|
||||
# Fetch issues
|
||||
issues_result = db.execute(
|
||||
text("""SELECT id, severity, title, description, cve, cvss,
|
||||
affected_component, affected_version, fixed_in, remediation, status
|
||||
FROM compliance_security_issues WHERE screening_id = :id"""),
|
||||
{"id": screening_id},
|
||||
)
|
||||
issues_rows = issues_result.fetchall()
|
||||
|
||||
issues = [
|
||||
SecurityIssueResponse(
|
||||
id=str(r[0]), severity=r[1], title=r[2], description=r[3],
|
||||
cve=r[4], cvss=r[5], affected_component=r[6],
|
||||
affected_version=r[7], fixed_in=r[8], remediation=r[9], status=r[10],
|
||||
)
|
||||
for r in issues_rows
|
||||
]
|
||||
|
||||
# Reconstruct components from SBOM data
|
||||
sbom_data = row[10] or {}
|
||||
components = []
|
||||
comp_vulns: dict[str, list[dict]] = {}
|
||||
for issue in issues:
|
||||
if issue.affected_component not in comp_vulns:
|
||||
comp_vulns[issue.affected_component] = []
|
||||
comp_vulns[issue.affected_component].append({
|
||||
"id": issue.cve or issue.id,
|
||||
"cve": issue.cve,
|
||||
"severity": issue.severity,
|
||||
"title": issue.title,
|
||||
"cvss": issue.cvss,
|
||||
"fixedIn": issue.fixed_in,
|
||||
})
|
||||
|
||||
for sc in sbom_data.get("components", []):
|
||||
components.append(SBOMComponentResponse(
|
||||
name=sc["name"],
|
||||
version=sc["version"],
|
||||
type=sc.get("type", "library"),
|
||||
purl=sc.get("purl", ""),
|
||||
licenses=sc.get("licenses", []),
|
||||
vulnerabilities=comp_vulns.get(sc["name"], []),
|
||||
))
|
||||
|
||||
return ScreeningResponse(
|
||||
id=str(row[0]),
|
||||
status=row[1],
|
||||
sbom_format=row[2] or "CycloneDX",
|
||||
sbom_version=row[3] or "1.5",
|
||||
total_components=row[4] or 0,
|
||||
total_issues=row[5] or 0,
|
||||
critical_issues=row[6] or 0,
|
||||
high_issues=row[7] or 0,
|
||||
medium_issues=row[8] or 0,
|
||||
low_issues=row[9] or 0,
|
||||
components=components,
|
||||
issues=issues,
|
||||
started_at=str(row[11]) if row[11] else None,
|
||||
completed_at=str(row[12]) if row[12] else None,
|
||||
)
|
||||
finally:
|
||||
db.close()
|
||||
|
||||
@@ -225,8 +292,33 @@ async def list_screenings(tenant_id: str = "default") -> ScreeningListResponse:
|
||||
"""List all screenings for a tenant."""
|
||||
db = SessionLocal()
|
||||
try:
|
||||
with translate_domain_errors():
|
||||
return ScreeningService(db).list_screenings(tenant_id)
|
||||
result = db.execute(
|
||||
text("""SELECT id, status, total_components, total_issues,
|
||||
critical_issues, high_issues, medium_issues, low_issues,
|
||||
started_at, completed_at, created_at
|
||||
FROM compliance_screenings
|
||||
WHERE tenant_id = :tenant_id
|
||||
ORDER BY created_at DESC"""),
|
||||
{"tenant_id": tenant_id},
|
||||
)
|
||||
rows = result.fetchall()
|
||||
screenings = [
|
||||
{
|
||||
"id": str(r[0]),
|
||||
"status": r[1],
|
||||
"total_components": r[2],
|
||||
"total_issues": r[3],
|
||||
"critical_issues": r[4],
|
||||
"high_issues": r[5],
|
||||
"medium_issues": r[6],
|
||||
"low_issues": r[7],
|
||||
"started_at": str(r[8]) if r[8] else None,
|
||||
"completed_at": str(r[9]) if r[9] else None,
|
||||
"created_at": str(r[10]),
|
||||
}
|
||||
for r in rows
|
||||
]
|
||||
return ScreeningListResponse(screenings=screenings, total=len(screenings))
|
||||
finally:
|
||||
db.close()
|
||||
|
||||
|
||||
537
backend-compliance/compliance/api/tom_mapping_routes.py
Normal file
537
backend-compliance/compliance/api/tom_mapping_routes.py
Normal file
@@ -0,0 +1,537 @@
|
||||
"""
|
||||
TOM ↔ Canonical Control Mapping Routes.
|
||||
|
||||
Three-layer architecture:
|
||||
TOM Measures (~88, audit-level) → Mapping Bridge → Canonical Controls (10,000+)
|
||||
|
||||
Endpoints:
|
||||
POST /v1/tom-mappings/sync — Sync canonical controls for company profile
|
||||
GET /v1/tom-mappings — List all mappings for tenant/project
|
||||
GET /v1/tom-mappings/by-tom/{code} — Mappings for a specific TOM control
|
||||
GET /v1/tom-mappings/stats — Coverage statistics
|
||||
POST /v1/tom-mappings/manual — Manually add a mapping
|
||||
DELETE /v1/tom-mappings/{id} — Remove a mapping
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
import logging
|
||||
from typing import Any, Optional
|
||||
|
||||
from fastapi import APIRouter, HTTPException, Query, Header
|
||||
from pydantic import BaseModel
|
||||
from sqlalchemy import text
|
||||
|
||||
from database import SessionLocal
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
router = APIRouter(prefix="/tom-mappings", tags=["tom-control-mappings"])
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# TOM CATEGORY → CANONICAL CATEGORY MAPPING
|
||||
# =============================================================================
|
||||
|
||||
# Maps the 13 TOM control categories to canonical_control_categories.
# Each TOM category maps to 1-3 canonical categories for broad coverage;
# sync_mappings iterates this table to pull matching canonical controls,
# so keys here double as the tom_category values stored in mappings.
TOM_TO_CANONICAL_CATEGORIES: dict[str, list[str]] = {
    "ACCESS_CONTROL": ["authentication", "identity", "physical"],
    "ADMISSION_CONTROL": ["authentication", "identity", "system"],
    "ACCESS_AUTHORIZATION": ["authentication", "identity"],
    "TRANSFER_CONTROL": ["network", "data_protection", "encryption"],
    "INPUT_CONTROL": ["application", "data_protection"],
    "ORDER_CONTROL": ["supply_chain", "compliance"],
    "AVAILABILITY": ["continuity", "system"],
    "SEPARATION": ["network", "data_protection"],
    "ENCRYPTION": ["encryption"],
    "PSEUDONYMIZATION": ["data_protection", "encryption"],
    "RESILIENCE": ["continuity", "system"],
    "RECOVERY": ["continuity"],
    "REVIEW": ["compliance", "governance", "risk"],
}
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# REQUEST / RESPONSE MODELS
|
||||
# =============================================================================
|
||||
|
||||
class SyncRequest(BaseModel):
    """Trigger a sync of canonical controls to TOM measures."""

    # Industry used for the applicable_industries JSONB filter; None disables it.
    industry: Optional[str] = None
    # Company-size bucket for the applicable_company_size JSONB filter; None disables it.
    company_size: Optional[str] = None
    # When True, re-sync even if the computed profile hash is unchanged.
    force: bool = False
|
||||
|
||||
|
||||
class ManualMappingRequest(BaseModel):
    """Manually add a canonical control to a TOM measure."""

    # TOM measure identifier and its category (stored verbatim on the mapping).
    tom_control_code: str
    tom_category: str
    # Target canonical control: UUID (cast server-side) plus its display code.
    canonical_control_id: str
    canonical_control_code: str
    # Optional; falls back to the canonical control's own category when omitted.
    canonical_category: Optional[str] = None
    # Defaults to full relevance for manual mappings.
    relevance_score: float = 1.0
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# HELPERS
|
||||
# =============================================================================
|
||||
|
||||
def _get_tenant_id(x_tenant_id: Optional[str]) -> str:
|
||||
"""Extract tenant ID from header."""
|
||||
if not x_tenant_id:
|
||||
raise HTTPException(status_code=400, detail="X-Tenant-ID header required")
|
||||
return x_tenant_id
|
||||
|
||||
|
||||
def _compute_profile_hash(industry: Optional[str], company_size: Optional[str]) -> str:
|
||||
"""Compute a hash from profile parameters for change detection."""
|
||||
data = json.dumps({"industry": industry, "company_size": company_size}, sort_keys=True)
|
||||
return hashlib.sha256(data.encode()).hexdigest()[:16]
|
||||
|
||||
|
||||
def _mapping_row_to_dict(r) -> dict[str, Any]:
|
||||
"""Convert a mapping row to API response dict."""
|
||||
return {
|
||||
"id": str(r.id),
|
||||
"tenant_id": str(r.tenant_id),
|
||||
"project_id": str(r.project_id) if r.project_id else None,
|
||||
"tom_control_code": r.tom_control_code,
|
||||
"tom_category": r.tom_category,
|
||||
"canonical_control_id": str(r.canonical_control_id),
|
||||
"canonical_control_code": r.canonical_control_code,
|
||||
"canonical_category": r.canonical_category,
|
||||
"mapping_type": r.mapping_type,
|
||||
"relevance_score": float(r.relevance_score) if r.relevance_score else 1.0,
|
||||
"created_at": r.created_at.isoformat() if r.created_at else None,
|
||||
}
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# SYNC ENDPOINT
|
||||
# =============================================================================
|
||||
|
||||
@router.post("/sync")
|
||||
async def sync_mappings(
|
||||
body: SyncRequest,
|
||||
x_tenant_id: Optional[str] = Header(None, alias="X-Tenant-ID"),
|
||||
project_id: Optional[str] = Query(None),
|
||||
):
|
||||
"""
|
||||
Sync canonical controls to TOM measures based on company profile.
|
||||
|
||||
Algorithm:
|
||||
1. Compute profile hash → skip if unchanged (unless force=True)
|
||||
2. For each TOM category, find matching canonical controls by:
|
||||
- Category mapping (TOM category → canonical categories)
|
||||
- Industry filter (applicable_industries JSONB containment)
|
||||
- Company size filter (applicable_company_size JSONB containment)
|
||||
- Only approved + customer_visible controls
|
||||
3. Delete old auto-mappings, insert new ones
|
||||
4. Update sync state
|
||||
"""
|
||||
tenant_id = _get_tenant_id(x_tenant_id)
|
||||
profile_hash = _compute_profile_hash(body.industry, body.company_size)
|
||||
|
||||
with SessionLocal() as db:
|
||||
# Check if sync is needed (profile unchanged)
|
||||
if not body.force:
|
||||
existing = db.execute(
|
||||
text("""
|
||||
SELECT profile_hash FROM tom_control_sync_state
|
||||
WHERE tenant_id = :tid AND (project_id = :pid OR (project_id IS NULL AND :pid IS NULL))
|
||||
"""),
|
||||
{"tid": tenant_id, "pid": project_id},
|
||||
).fetchone()
|
||||
if existing and existing.profile_hash == profile_hash:
|
||||
return {
|
||||
"status": "unchanged",
|
||||
"message": "Profile unchanged since last sync",
|
||||
"profile_hash": profile_hash,
|
||||
}
|
||||
|
||||
# Delete old auto-mappings for this tenant+project
|
||||
db.execute(
|
||||
text("""
|
||||
DELETE FROM tom_control_mappings
|
||||
WHERE tenant_id = :tid
|
||||
AND (project_id = :pid OR (project_id IS NULL AND :pid IS NULL))
|
||||
AND mapping_type = 'auto'
|
||||
"""),
|
||||
{"tid": tenant_id, "pid": project_id},
|
||||
)
|
||||
|
||||
total_mappings = 0
|
||||
canonical_ids_matched = set()
|
||||
tom_codes_covered = set()
|
||||
|
||||
# For each TOM category, find matching canonical controls
|
||||
for tom_category, canonical_categories in TOM_TO_CANONICAL_CATEGORIES.items():
|
||||
# Build JSONB containment query for categories
|
||||
cat_conditions = " OR ".join(
|
||||
f"category = :cat_{i}" for i in range(len(canonical_categories))
|
||||
)
|
||||
cat_params = {f"cat_{i}": c for i, c in enumerate(canonical_categories)}
|
||||
|
||||
# Build industry filter
|
||||
industry_filter = ""
|
||||
if body.industry:
|
||||
industry_filter = """
|
||||
AND (
|
||||
applicable_industries IS NULL
|
||||
OR applicable_industries @> '"all"'::jsonb
|
||||
OR applicable_industries @> (:industry)::jsonb
|
||||
)
|
||||
"""
|
||||
cat_params["industry"] = json.dumps([body.industry])
|
||||
|
||||
# Build company size filter
|
||||
size_filter = ""
|
||||
if body.company_size:
|
||||
size_filter = """
|
||||
AND (
|
||||
applicable_company_size IS NULL
|
||||
OR applicable_company_size @> '"all"'::jsonb
|
||||
OR applicable_company_size @> (:csize)::jsonb
|
||||
)
|
||||
"""
|
||||
cat_params["csize"] = json.dumps([body.company_size])
|
||||
|
||||
query = f"""
|
||||
SELECT id, control_id, category
|
||||
FROM canonical_controls
|
||||
WHERE ({cat_conditions})
|
||||
AND release_state = 'approved'
|
||||
AND customer_visible = true
|
||||
{industry_filter}
|
||||
{size_filter}
|
||||
ORDER BY control_id
|
||||
"""
|
||||
|
||||
rows = db.execute(text(query), cat_params).fetchall()
|
||||
|
||||
# Find TOM control codes in this category (query the frontend library
|
||||
# codes; we use the category prefix pattern from the loader)
|
||||
# TOM codes follow pattern: TOM-XX-NN where XX is category abbreviation
|
||||
# We insert one mapping per canonical control per TOM category
|
||||
for row in rows:
|
||||
db.execute(
|
||||
text("""
|
||||
INSERT INTO tom_control_mappings (
|
||||
tenant_id, project_id, tom_control_code, tom_category,
|
||||
canonical_control_id, canonical_control_code, canonical_category,
|
||||
mapping_type, relevance_score
|
||||
) VALUES (
|
||||
:tid, :pid, :tom_cat, :tom_cat,
|
||||
:cc_id, :cc_code, :cc_category,
|
||||
'auto', 1.00
|
||||
)
|
||||
ON CONFLICT (tenant_id, project_id, tom_control_code, canonical_control_id)
|
||||
DO NOTHING
|
||||
"""),
|
||||
{
|
||||
"tid": tenant_id,
|
||||
"pid": project_id,
|
||||
"tom_cat": tom_category,
|
||||
"cc_id": str(row.id),
|
||||
"cc_code": row.control_id,
|
||||
"cc_category": row.category,
|
||||
},
|
||||
)
|
||||
total_mappings += 1
|
||||
canonical_ids_matched.add(str(row.id))
|
||||
tom_codes_covered.add(tom_category)
|
||||
|
||||
# Upsert sync state
|
||||
db.execute(
|
||||
text("""
|
||||
INSERT INTO tom_control_sync_state (
|
||||
tenant_id, project_id, profile_hash,
|
||||
total_mappings, canonical_controls_matched, tom_controls_covered,
|
||||
last_synced_at
|
||||
) VALUES (
|
||||
:tid, :pid, :hash,
|
||||
:total, :matched, :covered,
|
||||
NOW()
|
||||
)
|
||||
ON CONFLICT (tenant_id, project_id)
|
||||
DO UPDATE SET
|
||||
profile_hash = :hash,
|
||||
total_mappings = :total,
|
||||
canonical_controls_matched = :matched,
|
||||
tom_controls_covered = :covered,
|
||||
last_synced_at = NOW()
|
||||
"""),
|
||||
{
|
||||
"tid": tenant_id,
|
||||
"pid": project_id,
|
||||
"hash": profile_hash,
|
||||
"total": total_mappings,
|
||||
"matched": len(canonical_ids_matched),
|
||||
"covered": len(tom_codes_covered),
|
||||
},
|
||||
)
|
||||
|
||||
db.commit()
|
||||
|
||||
return {
|
||||
"status": "synced",
|
||||
"profile_hash": profile_hash,
|
||||
"total_mappings": total_mappings,
|
||||
"canonical_controls_matched": len(canonical_ids_matched),
|
||||
"tom_categories_covered": len(tom_codes_covered),
|
||||
}
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# LIST MAPPINGS
|
||||
# =============================================================================
|
||||
|
||||
@router.get("")
|
||||
async def list_mappings(
|
||||
x_tenant_id: Optional[str] = Header(None, alias="X-Tenant-ID"),
|
||||
project_id: Optional[str] = Query(None),
|
||||
tom_category: Optional[str] = Query(None),
|
||||
mapping_type: Optional[str] = Query(None),
|
||||
limit: int = Query(500, ge=1, le=5000),
|
||||
offset: int = Query(0, ge=0),
|
||||
):
|
||||
"""List all TOM ↔ canonical control mappings for tenant/project."""
|
||||
tenant_id = _get_tenant_id(x_tenant_id)
|
||||
|
||||
query = """
|
||||
SELECT m.*, cc.title as canonical_title, cc.severity as canonical_severity
|
||||
FROM tom_control_mappings m
|
||||
LEFT JOIN canonical_controls cc ON cc.id = m.canonical_control_id
|
||||
WHERE m.tenant_id = :tid
|
||||
AND (m.project_id = :pid OR (m.project_id IS NULL AND :pid IS NULL))
|
||||
"""
|
||||
params: dict[str, Any] = {"tid": tenant_id, "pid": project_id}
|
||||
|
||||
if tom_category:
|
||||
query += " AND m.tom_category = :tcat"
|
||||
params["tcat"] = tom_category
|
||||
if mapping_type:
|
||||
query += " AND m.mapping_type = :mtype"
|
||||
params["mtype"] = mapping_type
|
||||
|
||||
query += " ORDER BY m.tom_category, m.canonical_control_code"
|
||||
query += " LIMIT :lim OFFSET :off"
|
||||
params["lim"] = limit
|
||||
params["off"] = offset
|
||||
|
||||
count_query = """
|
||||
SELECT count(*) FROM tom_control_mappings
|
||||
WHERE tenant_id = :tid
|
||||
AND (project_id = :pid OR (project_id IS NULL AND :pid IS NULL))
|
||||
"""
|
||||
count_params: dict[str, Any] = {"tid": tenant_id, "pid": project_id}
|
||||
if tom_category:
|
||||
count_query += " AND tom_category = :tcat"
|
||||
count_params["tcat"] = tom_category
|
||||
|
||||
with SessionLocal() as db:
|
||||
rows = db.execute(text(query), params).fetchall()
|
||||
total = db.execute(text(count_query), count_params).scalar()
|
||||
|
||||
mappings = []
|
||||
for r in rows:
|
||||
d = _mapping_row_to_dict(r)
|
||||
d["canonical_title"] = getattr(r, "canonical_title", None)
|
||||
d["canonical_severity"] = getattr(r, "canonical_severity", None)
|
||||
mappings.append(d)
|
||||
|
||||
return {"mappings": mappings, "total": total}
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# MAPPINGS BY TOM CONTROL
|
||||
# =============================================================================
|
||||
|
||||
@router.get("/by-tom/{tom_code}")
|
||||
async def get_mappings_by_tom(
|
||||
tom_code: str,
|
||||
x_tenant_id: Optional[str] = Header(None, alias="X-Tenant-ID"),
|
||||
project_id: Optional[str] = Query(None),
|
||||
):
|
||||
"""Get all canonical controls mapped to a specific TOM control code or category."""
|
||||
tenant_id = _get_tenant_id(x_tenant_id)
|
||||
|
||||
with SessionLocal() as db:
|
||||
rows = db.execute(
|
||||
text("""
|
||||
SELECT m.*, cc.title as canonical_title, cc.severity as canonical_severity,
|
||||
cc.objective as canonical_objective
|
||||
FROM tom_control_mappings m
|
||||
LEFT JOIN canonical_controls cc ON cc.id = m.canonical_control_id
|
||||
WHERE m.tenant_id = :tid
|
||||
AND (m.project_id = :pid OR (m.project_id IS NULL AND :pid IS NULL))
|
||||
AND (m.tom_control_code = :code OR m.tom_category = :code)
|
||||
ORDER BY m.canonical_control_code
|
||||
"""),
|
||||
{"tid": tenant_id, "pid": project_id, "code": tom_code},
|
||||
).fetchall()
|
||||
|
||||
mappings = []
|
||||
for r in rows:
|
||||
d = _mapping_row_to_dict(r)
|
||||
d["canonical_title"] = getattr(r, "canonical_title", None)
|
||||
d["canonical_severity"] = getattr(r, "canonical_severity", None)
|
||||
d["canonical_objective"] = getattr(r, "canonical_objective", None)
|
||||
mappings.append(d)
|
||||
|
||||
return {"tom_code": tom_code, "mappings": mappings, "total": len(mappings)}
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# STATS
|
||||
# =============================================================================
|
||||
|
||||
@router.get("/stats")
|
||||
async def get_mapping_stats(
|
||||
x_tenant_id: Optional[str] = Header(None, alias="X-Tenant-ID"),
|
||||
project_id: Optional[str] = Query(None),
|
||||
):
|
||||
"""Coverage statistics for TOM ↔ canonical control mappings."""
|
||||
tenant_id = _get_tenant_id(x_tenant_id)
|
||||
|
||||
with SessionLocal() as db:
|
||||
# Sync state
|
||||
sync_state = db.execute(
|
||||
text("""
|
||||
SELECT * FROM tom_control_sync_state
|
||||
WHERE tenant_id = :tid
|
||||
AND (project_id = :pid OR (project_id IS NULL AND :pid IS NULL))
|
||||
"""),
|
||||
{"tid": tenant_id, "pid": project_id},
|
||||
).fetchone()
|
||||
|
||||
# Per-category breakdown
|
||||
category_stats = db.execute(
|
||||
text("""
|
||||
SELECT tom_category,
|
||||
count(*) as total_mappings,
|
||||
count(DISTINCT canonical_control_id) as unique_controls,
|
||||
count(*) FILTER (WHERE mapping_type = 'auto') as auto_count,
|
||||
count(*) FILTER (WHERE mapping_type = 'manual') as manual_count
|
||||
FROM tom_control_mappings
|
||||
WHERE tenant_id = :tid
|
||||
AND (project_id = :pid OR (project_id IS NULL AND :pid IS NULL))
|
||||
GROUP BY tom_category
|
||||
ORDER BY tom_category
|
||||
"""),
|
||||
{"tid": tenant_id, "pid": project_id},
|
||||
).fetchall()
|
||||
|
||||
# Total canonical controls in DB (approved + visible)
|
||||
total_canonical = db.execute(
|
||||
text("""
|
||||
SELECT count(*) FROM canonical_controls
|
||||
WHERE release_state = 'approved' AND customer_visible = true
|
||||
""")
|
||||
).scalar()
|
||||
|
||||
return {
|
||||
"sync_state": {
|
||||
"profile_hash": sync_state.profile_hash if sync_state else None,
|
||||
"total_mappings": sync_state.total_mappings if sync_state else 0,
|
||||
"canonical_controls_matched": sync_state.canonical_controls_matched if sync_state else 0,
|
||||
"tom_controls_covered": sync_state.tom_controls_covered if sync_state else 0,
|
||||
"last_synced_at": sync_state.last_synced_at.isoformat() if sync_state and sync_state.last_synced_at else None,
|
||||
},
|
||||
"category_breakdown": [
|
||||
{
|
||||
"tom_category": r.tom_category,
|
||||
"total_mappings": r.total_mappings,
|
||||
"unique_controls": r.unique_controls,
|
||||
"auto_count": r.auto_count,
|
||||
"manual_count": r.manual_count,
|
||||
}
|
||||
for r in category_stats
|
||||
],
|
||||
"total_canonical_controls_available": total_canonical or 0,
|
||||
}
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# MANUAL MAPPING
|
||||
# =============================================================================
|
||||
|
||||
@router.post("/manual", status_code=201)
|
||||
async def add_manual_mapping(
|
||||
body: ManualMappingRequest,
|
||||
x_tenant_id: Optional[str] = Header(None, alias="X-Tenant-ID"),
|
||||
project_id: Optional[str] = Query(None),
|
||||
):
|
||||
"""Manually add a canonical control to a TOM measure."""
|
||||
tenant_id = _get_tenant_id(x_tenant_id)
|
||||
|
||||
with SessionLocal() as db:
|
||||
# Verify canonical control exists
|
||||
cc = db.execute(
|
||||
text("SELECT id, control_id, category FROM canonical_controls WHERE id = CAST(:cid AS uuid)"),
|
||||
{"cid": body.canonical_control_id},
|
||||
).fetchone()
|
||||
if not cc:
|
||||
raise HTTPException(status_code=404, detail="Canonical control not found")
|
||||
|
||||
try:
|
||||
row = db.execute(
|
||||
text("""
|
||||
INSERT INTO tom_control_mappings (
|
||||
tenant_id, project_id, tom_control_code, tom_category,
|
||||
canonical_control_id, canonical_control_code, canonical_category,
|
||||
mapping_type, relevance_score
|
||||
) VALUES (
|
||||
:tid, :pid, :tom_code, :tom_cat,
|
||||
CAST(:cc_id AS uuid), :cc_code, :cc_category,
|
||||
'manual', :score
|
||||
)
|
||||
RETURNING *
|
||||
"""),
|
||||
{
|
||||
"tid": tenant_id,
|
||||
"pid": project_id,
|
||||
"tom_code": body.tom_control_code,
|
||||
"tom_cat": body.tom_category,
|
||||
"cc_id": body.canonical_control_id,
|
||||
"cc_code": body.canonical_control_code,
|
||||
"cc_category": body.canonical_category or cc.category,
|
||||
"score": body.relevance_score,
|
||||
},
|
||||
).fetchone()
|
||||
db.commit()
|
||||
except Exception as e:
|
||||
if "unique" in str(e).lower() or "duplicate" in str(e).lower():
|
||||
raise HTTPException(status_code=409, detail="Mapping already exists")
|
||||
raise
|
||||
|
||||
return _mapping_row_to_dict(row)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# DELETE MAPPING
|
||||
# =============================================================================
|
||||
|
||||
@router.delete("/{mapping_id}", status_code=204)
|
||||
async def delete_mapping(
|
||||
mapping_id: str,
|
||||
x_tenant_id: Optional[str] = Header(None, alias="X-Tenant-ID"),
|
||||
):
|
||||
"""Remove a mapping (manual or auto)."""
|
||||
tenant_id = _get_tenant_id(x_tenant_id)
|
||||
|
||||
with SessionLocal() as db:
|
||||
result = db.execute(
|
||||
text("""
|
||||
DELETE FROM tom_control_mappings
|
||||
WHERE id = CAST(:mid AS uuid) AND tenant_id = :tid
|
||||
"""),
|
||||
{"mid": mapping_id, "tid": tenant_id},
|
||||
)
|
||||
if result.rowcount == 0:
|
||||
raise HTTPException(status_code=404, detail="Mapping not found")
|
||||
db.commit()
|
||||
|
||||
return None
|
||||
427
backend-compliance/compliance/api/vvt_library_routes.py
Normal file
427
backend-compliance/compliance/api/vvt_library_routes.py
Normal file
@@ -0,0 +1,427 @@
|
||||
"""
|
||||
FastAPI routes for VVT Master Libraries + Process Templates.
|
||||
|
||||
Library endpoints (read-only, global):
|
||||
GET /vvt/libraries — Overview: all library types + counts
|
||||
GET /vvt/libraries/data-subjects — Data subjects (filter: typical_for)
|
||||
GET /vvt/libraries/data-categories — Hierarchical (filter: parent_id, is_art9, flat)
|
||||
GET /vvt/libraries/recipients — Recipients (filter: type)
|
||||
GET /vvt/libraries/legal-bases — Legal bases (filter: is_art9, type)
|
||||
GET /vvt/libraries/retention-rules — Retention rules
|
||||
GET /vvt/libraries/transfer-mechanisms — Transfer mechanisms
|
||||
GET /vvt/libraries/purposes — Purposes (filter: typical_for)
|
||||
GET /vvt/libraries/toms — TOMs (filter: category)
|
||||
|
||||
Template endpoints:
|
||||
GET /vvt/templates — List templates (filter: business_function, search)
|
||||
GET /vvt/templates/{id} — Single template with resolved labels
|
||||
POST /vvt/templates/{id}/instantiate — Create VVT activity from template
|
||||
"""
|
||||
|
||||
import logging
|
||||
import uuid
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException, Query, Request
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from classroom_engine.database import get_db
|
||||
|
||||
from ..db.vvt_library_models import (
|
||||
VVTLibDataSubjectDB,
|
||||
VVTLibDataCategoryDB,
|
||||
VVTLibRecipientDB,
|
||||
VVTLibLegalBasisDB,
|
||||
VVTLibRetentionRuleDB,
|
||||
VVTLibTransferMechanismDB,
|
||||
VVTLibPurposeDB,
|
||||
VVTLibTomDB,
|
||||
VVTProcessTemplateDB,
|
||||
)
|
||||
from ..db.vvt_models import VVTActivityDB, VVTAuditLogDB
|
||||
from .tenant_utils import get_tenant_id
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
router = APIRouter(prefix="/vvt", tags=["compliance-vvt-libraries"])
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Helper: row → dict
|
||||
# ============================================================================
|
||||
|
||||
def _row_to_dict(row, extra_fields=None):
|
||||
"""Generic row → dict for library items."""
|
||||
d = {
|
||||
"id": row.id,
|
||||
"label_de": row.label_de,
|
||||
}
|
||||
if hasattr(row, 'description_de') and row.description_de:
|
||||
d["description_de"] = row.description_de
|
||||
if hasattr(row, 'sort_order'):
|
||||
d["sort_order"] = row.sort_order
|
||||
if extra_fields:
|
||||
for f in extra_fields:
|
||||
if hasattr(row, f):
|
||||
val = getattr(row, f)
|
||||
if val is not None:
|
||||
d[f] = val
|
||||
return d
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Library Overview
|
||||
# ============================================================================
|
||||
|
||||
@router.get("/libraries")
|
||||
async def get_libraries_overview(db: Session = Depends(get_db)):
|
||||
"""Overview of all library types with item counts."""
|
||||
return {
|
||||
"libraries": [
|
||||
{"type": "data-subjects", "count": db.query(VVTLibDataSubjectDB).count()},
|
||||
{"type": "data-categories", "count": db.query(VVTLibDataCategoryDB).count()},
|
||||
{"type": "recipients", "count": db.query(VVTLibRecipientDB).count()},
|
||||
{"type": "legal-bases", "count": db.query(VVTLibLegalBasisDB).count()},
|
||||
{"type": "retention-rules", "count": db.query(VVTLibRetentionRuleDB).count()},
|
||||
{"type": "transfer-mechanisms", "count": db.query(VVTLibTransferMechanismDB).count()},
|
||||
{"type": "purposes", "count": db.query(VVTLibPurposeDB).count()},
|
||||
{"type": "toms", "count": db.query(VVTLibTomDB).count()},
|
||||
]
|
||||
}
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Data Subjects
|
||||
# ============================================================================
|
||||
|
||||
@router.get("/libraries/data-subjects")
|
||||
async def list_data_subjects(
|
||||
typical_for: Optional[str] = Query(None, description="Filter by business function"),
|
||||
db: Session = Depends(get_db),
|
||||
):
|
||||
query = db.query(VVTLibDataSubjectDB).order_by(VVTLibDataSubjectDB.sort_order)
|
||||
rows = query.all()
|
||||
items = [_row_to_dict(r, ["art9_relevant", "typical_for"]) for r in rows]
|
||||
if typical_for:
|
||||
items = [i for i in items if typical_for in (i.get("typical_for") or [])]
|
||||
return items
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Data Categories (hierarchical)
|
||||
# ============================================================================
|
||||
|
||||
@router.get("/libraries/data-categories")
|
||||
async def list_data_categories(
|
||||
flat: Optional[bool] = Query(False, description="Return flat list instead of tree"),
|
||||
parent_id: Optional[str] = Query(None),
|
||||
is_art9: Optional[bool] = Query(None),
|
||||
db: Session = Depends(get_db),
|
||||
):
|
||||
query = db.query(VVTLibDataCategoryDB).order_by(VVTLibDataCategoryDB.sort_order)
|
||||
if parent_id is not None:
|
||||
query = query.filter(VVTLibDataCategoryDB.parent_id == parent_id)
|
||||
if is_art9 is not None:
|
||||
query = query.filter(VVTLibDataCategoryDB.is_art9 == is_art9)
|
||||
rows = query.all()
|
||||
|
||||
extra = ["parent_id", "is_art9", "is_art10", "risk_weight", "default_retention_rule", "default_legal_basis"]
|
||||
items = [_row_to_dict(r, extra) for r in rows]
|
||||
|
||||
if flat or parent_id is not None or is_art9 is not None:
|
||||
return items
|
||||
|
||||
# Build tree
|
||||
by_parent: dict = {}
|
||||
for item in items:
|
||||
pid = item.get("parent_id")
|
||||
by_parent.setdefault(pid, []).append(item)
|
||||
|
||||
tree = []
|
||||
for item in by_parent.get(None, []):
|
||||
children = by_parent.get(item["id"], [])
|
||||
if children:
|
||||
item["children"] = children
|
||||
tree.append(item)
|
||||
return tree
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Recipients
|
||||
# ============================================================================
|
||||
|
||||
@router.get("/libraries/recipients")
|
||||
async def list_recipients(
|
||||
type: Optional[str] = Query(None, description="INTERNAL, PROCESSOR, CONTROLLER, AUTHORITY"),
|
||||
db: Session = Depends(get_db),
|
||||
):
|
||||
query = db.query(VVTLibRecipientDB).order_by(VVTLibRecipientDB.sort_order)
|
||||
if type:
|
||||
query = query.filter(VVTLibRecipientDB.type == type)
|
||||
rows = query.all()
|
||||
return [_row_to_dict(r, ["type", "is_third_country", "country"]) for r in rows]
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Legal Bases
|
||||
# ============================================================================
|
||||
|
||||
@router.get("/libraries/legal-bases")
|
||||
async def list_legal_bases(
|
||||
is_art9: Optional[bool] = Query(None),
|
||||
type: Optional[str] = Query(None),
|
||||
db: Session = Depends(get_db),
|
||||
):
|
||||
query = db.query(VVTLibLegalBasisDB).order_by(VVTLibLegalBasisDB.sort_order)
|
||||
if is_art9 is not None:
|
||||
query = query.filter(VVTLibLegalBasisDB.is_art9 == is_art9)
|
||||
if type:
|
||||
query = query.filter(VVTLibLegalBasisDB.type == type)
|
||||
rows = query.all()
|
||||
return [_row_to_dict(r, ["article", "type", "is_art9", "typical_national_law"]) for r in rows]
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Retention Rules
|
||||
# ============================================================================
|
||||
|
||||
@router.get("/libraries/retention-rules")
|
||||
async def list_retention_rules(db: Session = Depends(get_db)):
|
||||
rows = db.query(VVTLibRetentionRuleDB).order_by(VVTLibRetentionRuleDB.sort_order).all()
|
||||
return [_row_to_dict(r, ["legal_basis", "duration", "duration_unit", "start_event", "deletion_procedure"]) for r in rows]
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Transfer Mechanisms
|
||||
# ============================================================================
|
||||
|
||||
@router.get("/libraries/transfer-mechanisms")
|
||||
async def list_transfer_mechanisms(db: Session = Depends(get_db)):
|
||||
rows = db.query(VVTLibTransferMechanismDB).order_by(VVTLibTransferMechanismDB.sort_order).all()
|
||||
return [_row_to_dict(r, ["article", "requires_tia"]) for r in rows]
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Purposes
|
||||
# ============================================================================
|
||||
|
||||
@router.get("/libraries/purposes")
|
||||
async def list_purposes(
|
||||
typical_for: Optional[str] = Query(None),
|
||||
db: Session = Depends(get_db),
|
||||
):
|
||||
rows = db.query(VVTLibPurposeDB).order_by(VVTLibPurposeDB.sort_order).all()
|
||||
items = [_row_to_dict(r, ["typical_legal_basis", "typical_for"]) for r in rows]
|
||||
if typical_for:
|
||||
items = [i for i in items if typical_for in (i.get("typical_for") or [])]
|
||||
return items
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# TOMs
|
||||
# ============================================================================
|
||||
|
||||
@router.get("/libraries/toms")
|
||||
async def list_toms(
|
||||
category: Optional[str] = Query(None),
|
||||
db: Session = Depends(get_db),
|
||||
):
|
||||
query = db.query(VVTLibTomDB).order_by(VVTLibTomDB.sort_order)
|
||||
if category:
|
||||
query = query.filter(VVTLibTomDB.category == category)
|
||||
rows = query.all()
|
||||
return [_row_to_dict(r, ["category", "art32_reference"]) for r in rows]
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Process Templates
|
||||
# ============================================================================
|
||||
|
||||
def _template_to_dict(t: VVTProcessTemplateDB) -> dict:
    """Serialize a process template row into a plain dict.

    Nullable JSON list columns are normalized to empty lists;
    protection_level and dpia_required fall back to their defaults.
    """
    def _lst(value):
        # Normalize a nullable list column to an empty list.
        return value or []

    return {
        "id": t.id,
        "name": t.name,
        "description": t.description,
        "business_function": t.business_function,
        "purpose_refs": _lst(t.purpose_refs),
        "legal_basis_refs": _lst(t.legal_basis_refs),
        "data_subject_refs": _lst(t.data_subject_refs),
        "data_category_refs": _lst(t.data_category_refs),
        "recipient_refs": _lst(t.recipient_refs),
        "tom_refs": _lst(t.tom_refs),
        "transfer_mechanism_refs": _lst(t.transfer_mechanism_refs),
        "retention_rule_ref": t.retention_rule_ref,
        "typical_systems": _lst(t.typical_systems),
        "protection_level": t.protection_level or "MEDIUM",
        "dpia_required": t.dpia_required or False,
        "risk_score": t.risk_score,
        "tags": _lst(t.tags),
        "is_system": t.is_system,
        "sort_order": t.sort_order,
    }
|
||||
|
||||
|
||||
def _resolve_labels(template_dict: dict, db: Session) -> dict:
    """Resolve library IDs to labels within the template dict."""
    # (refs field, library model, target labels field) — the order here
    # defines the order in which *_labels keys are added to the dict.
    ref_specs = [
        ("purpose_refs", VVTLibPurposeDB, "purpose_labels"),
        ("legal_basis_refs", VVTLibLegalBasisDB, "legal_basis_labels"),
        ("data_subject_refs", VVTLibDataSubjectDB, "data_subject_labels"),
        ("data_category_refs", VVTLibDataCategoryDB, "data_category_labels"),
        ("recipient_refs", VVTLibRecipientDB, "recipient_labels"),
        ("tom_refs", VVTLibTomDB, "tom_labels"),
        ("transfer_mechanism_refs", VVTLibTransferMechanismDB, "transfer_mechanism_labels"),
    ]
    for refs_key, model, labels_key in ref_specs:
        ref_ids = template_dict.get(refs_key) or []
        if not ref_ids:
            continue
        found = db.query(model).filter(model.id.in_(ref_ids)).all()
        by_id = {row.id: row.label_de for row in found}
        # Unknown IDs fall back to the raw ID string.
        template_dict[labels_key] = {rid: by_id.get(rid, rid) for rid in ref_ids}

    # Retention rule is a single reference resolved to one label.
    retention_id = template_dict.get("retention_rule_ref")
    if retention_id:
        rule = db.query(VVTLibRetentionRuleDB).filter(VVTLibRetentionRuleDB.id == retention_id).first()
        if rule:
            template_dict["retention_rule_label"] = rule.label_de

    return template_dict
|
||||
|
||||
|
||||
@router.get("/templates")
async def list_templates(
    business_function: Optional[str] = Query(None),
    search: Optional[str] = Query(None),
    db: Session = Depends(get_db),
):
    """List process templates (system + tenant)."""
    q = db.query(VVTProcessTemplateDB).order_by(VVTProcessTemplateDB.sort_order)
    if business_function:
        q = q.filter(VVTProcessTemplateDB.business_function == business_function)
    if search:
        # Case-insensitive substring match on name OR description.
        pattern = f"%{search}%"
        name_match = VVTProcessTemplateDB.name.ilike(pattern)
        desc_match = VVTProcessTemplateDB.description.ilike(pattern)
        q = q.filter(name_match | desc_match)
    return [_template_to_dict(tpl) for tpl in q.all()]
|
||||
|
||||
|
||||
@router.get("/templates/{template_id}")
async def get_template(
    template_id: str,
    db: Session = Depends(get_db),
):
    """Get a single template with resolved library labels."""
    template = (
        db.query(VVTProcessTemplateDB)
        .filter(VVTProcessTemplateDB.id == template_id)
        .first()
    )
    if template is None:
        raise HTTPException(status_code=404, detail=f"Template '{template_id}' not found")
    return _resolve_labels(_template_to_dict(template), db)
|
||||
|
||||
|
||||
@router.post("/templates/{template_id}/instantiate", status_code=201)
async def instantiate_template(
    template_id: str,
    http_request: Request,
    tid: str = Depends(get_tenant_id),
    db: Session = Depends(get_db),
):
    """Create a new VVT activity from a process template.

    Copies the template's library references onto the new activity and also
    resolves them to freetext labels for the legacy (pre-library) fields.
    Writes a CREATE audit-log entry in the same transaction and returns the
    full activity response.

    Raises:
        HTTPException 404: if no template with ``template_id`` exists.
    """
    t = db.query(VVTProcessTemplateDB).filter(VVTProcessTemplateDB.id == template_id).first()
    if not t:
        raise HTTPException(status_code=404, detail=f"Template '{template_id}' not found")

    # Generate unique VVT-ID from the tenant's current activity count.
    # NOTE(review): count-based IDs can collide under concurrent requests or
    # after deletions — confirm whether a DB-side sequence is needed.
    count = db.query(VVTActivityDB).filter(VVTActivityDB.tenant_id == tid).count()
    vvt_id = f"VVT-{count + 1:04d}"

    # Resolve library IDs to freetext labels for backward-compat fields
    purpose_labels = _resolve_ids(db, VVTLibPurposeDB, t.purpose_refs or [])
    legal_labels = _resolve_ids(db, VVTLibLegalBasisDB, t.legal_basis_refs or [])
    subject_labels = _resolve_ids(db, VVTLibDataSubjectDB, t.data_subject_refs or [])
    category_labels = _resolve_ids(db, VVTLibDataCategoryDB, t.data_category_refs or [])
    recipient_labels = _resolve_ids(db, VVTLibRecipientDB, t.recipient_refs or [])

    # Resolve the single retention rule into the legacy dict shape.
    retention_period = {}
    if t.retention_rule_ref:
        rr = db.query(VVTLibRetentionRuleDB).filter(VVTLibRetentionRuleDB.id == t.retention_rule_ref).first()
        if rr:
            retention_period = {
                "description": rr.label_de,
                "legalBasis": rr.legal_basis or "",
                "deletionProcedure": rr.deletion_procedure or "",
                "duration": rr.duration,
                "durationUnit": rr.duration_unit,
            }

    # Build structured TOMs from tom_refs, grouped by TOM category;
    # TOMs with a category outside the five known buckets are dropped.
    structured_toms = {"accessControl": [], "confidentiality": [], "integrity": [], "availability": [], "separation": []}
    if t.tom_refs:
        tom_rows = db.query(VVTLibTomDB).filter(VVTLibTomDB.id.in_(t.tom_refs)).all()
        for tr in tom_rows:
            cat = tr.category
            if cat in structured_toms:
                structured_toms[cat].append(tr.label_de)

    act = VVTActivityDB(
        tenant_id=tid,
        vvt_id=vvt_id,
        name=t.name,
        description=t.description or "",
        purposes=purpose_labels,
        legal_bases=[{"type": lid, "description": lbl} for lid, lbl in zip(t.legal_basis_refs or [], legal_labels)],
        data_subject_categories=subject_labels,
        personal_data_categories=category_labels,
        recipient_categories=[{"type": "unknown", "name": lbl} for lbl in recipient_labels],
        retention_period=retention_period,
        business_function=t.business_function,
        systems=[{"systemId": s, "name": s} for s in (t.typical_systems or [])],
        protection_level=t.protection_level or "MEDIUM",
        dpia_required=t.dpia_required or False,
        structured_toms=structured_toms,
        status="DRAFT",
        created_by=http_request.headers.get("X-User-ID", "system"),
        # Library refs copied verbatim from the template
        purpose_refs=t.purpose_refs,
        legal_basis_refs=t.legal_basis_refs,
        data_subject_refs=t.data_subject_refs,
        data_category_refs=t.data_category_refs,
        recipient_refs=t.recipient_refs,
        retention_rule_ref=t.retention_rule_ref,
        transfer_mechanism_refs=t.transfer_mechanism_refs,
        tom_refs=t.tom_refs,
        source_template_id=t.id,
        risk_score=t.risk_score,
    )
    db.add(act)
    # Flush (not commit) so act.id is assigned for the audit-log row below.
    db.flush()

    # Audit log — recorded in the same transaction as the activity itself.
    audit = VVTAuditLogDB(
        tenant_id=tid,
        action="CREATE",
        entity_type="activity",
        entity_id=act.id,
        changed_by=http_request.headers.get("X-User-ID", "system"),
        new_values={"vvt_id": vvt_id, "source_template_id": t.id, "name": t.name},
    )
    db.add(audit)
    db.commit()
    db.refresh(act)

    # Return full response; imported locally to avoid a circular import.
    from .vvt_routes import _activity_to_response
    return _activity_to_response(act)
|
||||
|
||||
|
||||
def _resolve_ids(db: Session, model, ids: list) -> list:
    """Resolve list of library IDs to list of label_de strings."""
    if not ids:
        return []
    matched = db.query(model).filter(model.id.in_(ids)).all()
    labels_by_id = {row.id: row.label_de for row in matched}
    # IDs with no matching row pass through unchanged, so callers never
    # lose entries when a reference is stale.
    return [labels_by_id.get(ref_id, ref_id) for ref_id in ids]
|
||||
@@ -81,6 +81,54 @@ async def upsert_organization(
|
||||
# Activities
|
||||
# ============================================================================
|
||||
|
||||
def _activity_to_response(act: VVTActivityDB) -> VVTActivityResponse:
    """Convert a VVTActivityDB row into a VVTActivityResponse.

    Nullable collection columns are normalized to empty lists/dicts, and
    protection_level / status fall back to their defaults.
    """
    core_fields = dict(
        id=str(act.id),
        vvt_id=act.vvt_id,
        name=act.name,
        description=act.description,
        purposes=act.purposes or [],
        legal_bases=act.legal_bases or [],
        data_subject_categories=act.data_subject_categories or [],
        personal_data_categories=act.personal_data_categories or [],
        recipient_categories=act.recipient_categories or [],
        third_country_transfers=act.third_country_transfers or [],
        retention_period=act.retention_period or {},
        tom_description=act.tom_description,
        business_function=act.business_function,
        systems=act.systems or [],
        deployment_model=act.deployment_model,
        data_sources=act.data_sources or [],
        data_flows=act.data_flows or [],
        protection_level=act.protection_level or 'MEDIUM',
        dpia_required=act.dpia_required or False,
        structured_toms=act.structured_toms or {},
        status=act.status or 'DRAFT',
        responsible=act.responsible,
        owner=act.owner,
        last_reviewed_at=act.last_reviewed_at,
        next_review_at=act.next_review_at,
        created_by=act.created_by,
        dsfa_id=str(act.dsfa_id) if act.dsfa_id else None,
    )
    # Library references and derived/bookkeeping metadata.
    library_fields = dict(
        purpose_refs=act.purpose_refs,
        legal_basis_refs=act.legal_basis_refs,
        data_subject_refs=act.data_subject_refs,
        data_category_refs=act.data_category_refs,
        recipient_refs=act.recipient_refs,
        retention_rule_ref=act.retention_rule_ref,
        transfer_mechanism_refs=act.transfer_mechanism_refs,
        tom_refs=act.tom_refs,
        source_template_id=act.source_template_id,
        risk_score=act.risk_score,
        linked_loeschfristen_ids=act.linked_loeschfristen_ids,
        linked_tom_measure_ids=act.linked_tom_measure_ids,
        art30_completeness=act.art30_completeness,
        created_at=act.created_at,
        updated_at=act.updated_at,
    )
    return VVTActivityResponse(**core_fields, **library_fields)
|
||||
|
||||
|
||||
@router.get("/activities", response_model=List[VVTActivityResponse])
|
||||
async def list_activities(
|
||||
status: Optional[str] = Query(None),
|
||||
@@ -145,6 +193,107 @@ async def delete_activity(
|
||||
return service.delete_activity(tid, activity_id)
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Art. 30 Completeness Check
|
||||
# ============================================================================
|
||||
|
||||
@router.get("/activities/{activity_id}/completeness")
async def get_activity_completeness(
    activity_id: str,
    tid: str = Depends(get_tenant_id),
    db: Session = Depends(get_db),
):
    """Calculate Art. 30 completeness score for a VVT activity."""
    activity = (
        db.query(VVTActivityDB)
        .filter(
            VVTActivityDB.id == activity_id,
            VVTActivityDB.tenant_id == tid,
        )
        .first()
    )
    if activity is None:
        raise HTTPException(status_code=404, detail=f"Activity {activity_id} not found")
    return _calculate_completeness(activity)
|
||||
|
||||
|
||||
def _calculate_completeness(act: VVTActivityDB) -> dict:
|
||||
"""Calculate Art. 30 completeness — required fields per DSGVO Art. 30 Abs. 1."""
|
||||
missing = []
|
||||
warnings = []
|
||||
total_checks = 10
|
||||
passed = 0
|
||||
|
||||
# 1. Name/Zweck
|
||||
if act.name:
|
||||
passed += 1
|
||||
else:
|
||||
missing.append("name")
|
||||
|
||||
# 2. Verarbeitungszwecke
|
||||
has_purposes = bool(act.purposes) or bool(act.purpose_refs)
|
||||
if has_purposes:
|
||||
passed += 1
|
||||
else:
|
||||
missing.append("purposes")
|
||||
|
||||
# 3. Rechtsgrundlage
|
||||
has_legal = bool(act.legal_bases) or bool(act.legal_basis_refs)
|
||||
if has_legal:
|
||||
passed += 1
|
||||
else:
|
||||
missing.append("legal_bases")
|
||||
|
||||
# 4. Betroffenenkategorien
|
||||
has_subjects = bool(act.data_subject_categories) or bool(act.data_subject_refs)
|
||||
if has_subjects:
|
||||
passed += 1
|
||||
else:
|
||||
missing.append("data_subjects")
|
||||
|
||||
# 5. Datenkategorien
|
||||
has_categories = bool(act.personal_data_categories) or bool(act.data_category_refs)
|
||||
if has_categories:
|
||||
passed += 1
|
||||
else:
|
||||
missing.append("data_categories")
|
||||
|
||||
# 6. Empfaenger
|
||||
has_recipients = bool(act.recipient_categories) or bool(act.recipient_refs)
|
||||
if has_recipients:
|
||||
passed += 1
|
||||
else:
|
||||
missing.append("recipients")
|
||||
|
||||
# 7. Drittland-Uebermittlung (checked but not strictly required)
|
||||
passed += 1 # always passes — no transfer is valid state
|
||||
|
||||
# 8. Loeschfristen
|
||||
has_retention = bool(act.retention_period and act.retention_period.get('description')) or bool(act.retention_rule_ref)
|
||||
if has_retention:
|
||||
passed += 1
|
||||
else:
|
||||
missing.append("retention_period")
|
||||
|
||||
# 9. TOM-Beschreibung
|
||||
has_tom = bool(act.tom_description) or bool(act.tom_refs) or bool(act.structured_toms)
|
||||
if has_tom:
|
||||
passed += 1
|
||||
else:
|
||||
missing.append("tom_description")
|
||||
|
||||
# 10. Verantwortlicher
|
||||
if act.responsible:
|
||||
passed += 1
|
||||
else:
|
||||
missing.append("responsible")
|
||||
|
||||
# Warnings
|
||||
if act.dpia_required and not act.dsfa_id:
|
||||
warnings.append("dpia_required_but_no_dsfa_linked")
|
||||
if act.third_country_transfers and not act.transfer_mechanism_refs:
|
||||
warnings.append("third_country_transfer_without_mechanism")
|
||||
|
||||
score = int((passed / total_checks) * 100)
|
||||
return {"score": score, "missing": missing, "warnings": warnings, "passed": passed, "total": total_checks}
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Audit Log
|
||||
# ============================================================================
|
||||
|
||||
443
backend-compliance/compliance/data/frameworks/csa_ccm.json
Normal file
443
backend-compliance/compliance/data/frameworks/csa_ccm.json
Normal file
@@ -0,0 +1,443 @@
|
||||
{
|
||||
"framework_id": "CSA_CCM",
|
||||
"display_name": "Cloud Security Alliance CCM v4",
|
||||
"license": {
|
||||
"type": "restricted",
|
||||
"rag_allowed": false,
|
||||
"use_as_metadata": true,
|
||||
"note": "Abstrahierte Struktur — keine Originaltexte uebernommen"
|
||||
},
|
||||
"domains": [
|
||||
{
|
||||
"domain_id": "AIS",
|
||||
"title": "Application and Interface Security",
|
||||
"aliases": ["ais", "application and interface security", "anwendungssicherheit", "schnittstellensicherheit"],
|
||||
"keywords": ["application", "anwendung", "interface", "schnittstelle", "api", "web", "eingabevalidierung"],
|
||||
"subcontrols": [
|
||||
{
|
||||
"subcontrol_id": "AIS-01",
|
||||
"title": "Application Security Policy",
|
||||
"statement": "Sicherheitsrichtlinien fuer Anwendungsentwicklung und Schnittstellenmanagement muessen definiert und angewendet werden.",
|
||||
"keywords": ["policy", "richtlinie", "entwicklung"],
|
||||
"action_hint": "document",
|
||||
"object_hint": "Anwendungssicherheitsrichtlinie",
|
||||
"object_class": "policy"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "AIS-02",
|
||||
"title": "Application Security Design",
|
||||
"statement": "Sicherheitsanforderungen muessen in den Entwurf jeder Anwendung integriert werden.",
|
||||
"keywords": ["design", "entwurf", "security by design"],
|
||||
"action_hint": "implement",
|
||||
"object_hint": "Sicherheitsanforderungen im Anwendungsentwurf",
|
||||
"object_class": "process"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "AIS-03",
|
||||
"title": "Application Security Testing",
|
||||
"statement": "Anwendungen muessen vor dem Deployment und regelmaessig auf Sicherheitsschwachstellen getestet werden.",
|
||||
"keywords": ["testing", "test", "sast", "dast", "penetration"],
|
||||
"action_hint": "test",
|
||||
"object_hint": "Anwendungssicherheitstests",
|
||||
"object_class": "process"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "AIS-04",
|
||||
"title": "Secure Development Practices",
|
||||
"statement": "Sichere Entwicklungspraktiken (Code Review, Pair Programming, SAST) muessen fuer alle Entwicklungsprojekte gelten.",
|
||||
"keywords": ["development", "entwicklung", "code review", "sast", "praktiken"],
|
||||
"action_hint": "implement",
|
||||
"object_hint": "Sichere Entwicklungspraktiken",
|
||||
"object_class": "process"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "AIS-05",
|
||||
"title": "API Security",
|
||||
"statement": "APIs muessen authentifiziert, autorisiert und gegen Missbrauch geschuetzt werden.",
|
||||
"keywords": ["api", "schnittstelle", "authentifizierung", "rate limiting"],
|
||||
"action_hint": "implement",
|
||||
"object_hint": "API-Sicherheitskontrollen",
|
||||
"object_class": "interface"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "AIS-06",
|
||||
"title": "Automated Application Security Testing",
|
||||
"statement": "Automatisierte Sicherheitstests muessen in die CI/CD-Pipeline integriert werden.",
|
||||
"keywords": ["automatisiert", "ci/cd", "pipeline", "sast", "dast"],
|
||||
"action_hint": "configure",
|
||||
"object_hint": "Automatisierte Sicherheitstests in CI/CD",
|
||||
"object_class": "configuration"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"domain_id": "BCR",
|
||||
"title": "Business Continuity and Resilience",
|
||||
"aliases": ["bcr", "business continuity", "resilience", "geschaeftskontinuitaet", "resilienz"],
|
||||
"keywords": ["continuity", "kontinuitaet", "resilience", "resilienz", "disaster", "recovery", "backup"],
|
||||
"subcontrols": [
|
||||
{
|
||||
"subcontrol_id": "BCR-01",
|
||||
"title": "Business Continuity Planning",
|
||||
"statement": "Ein Geschaeftskontinuitaetsplan muss erstellt, dokumentiert und regelmaessig getestet werden.",
|
||||
"keywords": ["plan", "kontinuitaet", "geschaeft"],
|
||||
"action_hint": "document",
|
||||
"object_hint": "Geschaeftskontinuitaetsplan",
|
||||
"object_class": "policy"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "BCR-02",
|
||||
"title": "Risk Assessment for BCM",
|
||||
"statement": "Risikobewertungen muessen fuer geschaeftskritische Prozesse durchgefuehrt werden.",
|
||||
"keywords": ["risiko", "bewertung", "kritisch"],
|
||||
"action_hint": "assess",
|
||||
"object_hint": "BCM-Risikobewertung",
|
||||
"object_class": "risk_artifact"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "BCR-03",
|
||||
"title": "Backup and Recovery",
|
||||
"statement": "Datensicherungen muessen regelmaessig erstellt und Wiederherstellungstests durchgefuehrt werden.",
|
||||
"keywords": ["backup", "sicherung", "wiederherstellung", "recovery"],
|
||||
"action_hint": "maintain",
|
||||
"object_hint": "Datensicherung und Wiederherstellung",
|
||||
"object_class": "technical_control"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "BCR-04",
|
||||
"title": "Disaster Recovery Planning",
|
||||
"statement": "Ein Disaster-Recovery-Plan muss dokumentiert und jaehrlich getestet werden.",
|
||||
"keywords": ["disaster", "recovery", "katastrophe"],
|
||||
"action_hint": "document",
|
||||
"object_hint": "Disaster-Recovery-Plan",
|
||||
"object_class": "policy"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"domain_id": "CCC",
|
||||
"title": "Change Control and Configuration Management",
|
||||
"aliases": ["ccc", "change control", "configuration management", "aenderungsmanagement", "konfigurationsmanagement"],
|
||||
"keywords": ["change", "aenderung", "konfiguration", "configuration", "release", "deployment"],
|
||||
"subcontrols": [
|
||||
{
|
||||
"subcontrol_id": "CCC-01",
|
||||
"title": "Change Management Policy",
|
||||
"statement": "Ein Aenderungsmanagement-Prozess muss definiert und fuer alle Aenderungen angewendet werden.",
|
||||
"keywords": ["policy", "richtlinie", "aenderung"],
|
||||
"action_hint": "document",
|
||||
"object_hint": "Aenderungsmanagement-Richtlinie",
|
||||
"object_class": "policy"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "CCC-02",
|
||||
"title": "Change Testing",
|
||||
"statement": "Aenderungen muessen vor der Produktivsetzung getestet und genehmigt werden.",
|
||||
"keywords": ["test", "genehmigung", "approval"],
|
||||
"action_hint": "test",
|
||||
"object_hint": "Aenderungstests",
|
||||
"object_class": "process"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "CCC-03",
|
||||
"title": "Configuration Baseline",
|
||||
"statement": "Basiskonfigurationen fuer alle Systeme muessen definiert und dokumentiert werden.",
|
||||
"keywords": ["baseline", "basis", "standard"],
|
||||
"action_hint": "define",
|
||||
"object_hint": "Konfigurationsbaseline",
|
||||
"object_class": "configuration"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"domain_id": "CEK",
|
||||
"title": "Cryptography, Encryption and Key Management",
|
||||
"aliases": ["cek", "cryptography", "encryption", "key management", "kryptographie", "verschluesselung", "schluesselverwaltung"],
|
||||
"keywords": ["kryptographie", "verschluesselung", "schluessel", "key", "encryption", "certificate", "zertifikat"],
|
||||
"subcontrols": [
|
||||
{
|
||||
"subcontrol_id": "CEK-01",
|
||||
"title": "Encryption Policy",
|
||||
"statement": "Verschluesselungsrichtlinien muessen definiert werden, die Algorithmen, Schluessellaengen und Einsatzbereiche festlegen.",
|
||||
"keywords": ["policy", "richtlinie", "algorithmus"],
|
||||
"action_hint": "document",
|
||||
"object_hint": "Verschluesselungsrichtlinie",
|
||||
"object_class": "policy"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "CEK-02",
|
||||
"title": "Key Management",
|
||||
"statement": "Kryptographische Schluessel muessen ueber ihren Lebenszyklus sicher verwaltet werden.",
|
||||
"keywords": ["key", "schluessel", "management", "lebenszyklus"],
|
||||
"action_hint": "maintain",
|
||||
"object_hint": "Schluesselverwaltung",
|
||||
"object_class": "cryptographic_control"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "CEK-03",
|
||||
"title": "Data Encryption",
|
||||
"statement": "Sensible Daten muessen bei Speicherung und Uebertragung verschluesselt werden.",
|
||||
"keywords": ["data", "daten", "speicherung", "uebertragung"],
|
||||
"action_hint": "encrypt",
|
||||
"object_hint": "Datenverschluesselung",
|
||||
"object_class": "cryptographic_control"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"domain_id": "DSP",
|
||||
"title": "Data Security and Privacy",
|
||||
"aliases": ["dsp", "data security", "privacy", "datensicherheit", "datenschutz"],
|
||||
"keywords": ["datenschutz", "datensicherheit", "privacy", "data security", "pii", "personenbezogen", "dsgvo"],
|
||||
"subcontrols": [
|
||||
{
|
||||
"subcontrol_id": "DSP-01",
|
||||
"title": "Data Classification",
|
||||
"statement": "Daten muessen nach Sensibilitaet klassifiziert und entsprechend geschuetzt werden.",
|
||||
"keywords": ["klassifizierung", "sensibilitaet", "classification"],
|
||||
"action_hint": "define",
|
||||
"object_hint": "Datenklassifizierung",
|
||||
"object_class": "data"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "DSP-02",
|
||||
"title": "Data Inventory",
|
||||
"statement": "Ein Dateninventar muss gefuehrt werden, das alle Verarbeitungen personenbezogener Daten dokumentiert.",
|
||||
"keywords": ["inventar", "verzeichnis", "verarbeitung", "vvt"],
|
||||
"action_hint": "maintain",
|
||||
"object_hint": "Dateninventar",
|
||||
"object_class": "register"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "DSP-03",
|
||||
"title": "Data Retention and Deletion",
|
||||
"statement": "Aufbewahrungsfristen muessen definiert und Daten nach Ablauf sicher geloescht werden.",
|
||||
"keywords": ["retention", "aufbewahrung", "loeschung", "frist"],
|
||||
"action_hint": "delete",
|
||||
"object_hint": "Datenloeschung nach Frist",
|
||||
"object_class": "data"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "DSP-04",
|
||||
"title": "Privacy Impact Assessment",
|
||||
"statement": "Datenschutz-Folgenabschaetzungen muessen fuer risikoreiche Verarbeitungen durchgefuehrt werden.",
|
||||
"keywords": ["dsfa", "pia", "folgenabschaetzung", "impact"],
|
||||
"action_hint": "assess",
|
||||
"object_hint": "Datenschutz-Folgenabschaetzung",
|
||||
"object_class": "risk_artifact"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "DSP-05",
|
||||
"title": "Data Subject Rights",
|
||||
"statement": "Verfahren zur Bearbeitung von Betroffenenrechten muessen implementiert werden.",
|
||||
"keywords": ["betroffenenrechte", "auskunft", "loeschung", "data subject"],
|
||||
"action_hint": "implement",
|
||||
"object_hint": "Betroffenenrechte-Verfahren",
|
||||
"object_class": "process"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"domain_id": "GRC",
|
||||
"title": "Governance, Risk and Compliance",
|
||||
"aliases": ["grc", "governance", "risk", "compliance", "risikomanagement"],
|
||||
"keywords": ["governance", "risiko", "compliance", "management", "policy", "richtlinie"],
|
||||
"subcontrols": [
|
||||
{
|
||||
"subcontrol_id": "GRC-01",
|
||||
"title": "Information Security Program",
|
||||
"statement": "Ein umfassendes Informationssicherheitsprogramm muss etabliert und aufrechterhalten werden.",
|
||||
"keywords": ["programm", "sicherheit", "information"],
|
||||
"action_hint": "maintain",
|
||||
"object_hint": "Informationssicherheitsprogramm",
|
||||
"object_class": "policy"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "GRC-02",
|
||||
"title": "Risk Management Program",
|
||||
"statement": "Ein Risikomanagement-Programm muss implementiert werden, das Identifikation, Bewertung und Behandlung umfasst.",
|
||||
"keywords": ["risiko", "management", "bewertung", "behandlung"],
|
||||
"action_hint": "implement",
|
||||
"object_hint": "Risikomanagement-Programm",
|
||||
"object_class": "process"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "GRC-03",
|
||||
"title": "Compliance Monitoring",
|
||||
"statement": "Die Einhaltung regulatorischer und vertraglicher Anforderungen muss ueberwacht werden.",
|
||||
"keywords": ["compliance", "einhaltung", "regulatorisch", "ueberwachung"],
|
||||
"action_hint": "monitor",
|
||||
"object_hint": "Compliance-Ueberwachung",
|
||||
"object_class": "process"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"domain_id": "IAM",
|
||||
"title": "Identity and Access Management",
|
||||
"aliases": ["iam", "identity", "access management", "identitaetsmanagement", "zugriffsverwaltung"],
|
||||
"keywords": ["identitaet", "zugriff", "identity", "access", "authentifizierung", "autorisierung", "sso"],
|
||||
"subcontrols": [
|
||||
{
|
||||
"subcontrol_id": "IAM-01",
|
||||
"title": "Identity and Access Policy",
|
||||
"statement": "Identitaets- und Zugriffsmanagement-Richtlinien muessen definiert werden.",
|
||||
"keywords": ["policy", "richtlinie"],
|
||||
"action_hint": "document",
|
||||
"object_hint": "IAM-Richtlinie",
|
||||
"object_class": "policy"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "IAM-02",
|
||||
"title": "Strong Authentication",
|
||||
"statement": "Starke Authentifizierung (MFA) muss fuer administrative und sicherheitskritische Zugriffe gefordert werden.",
|
||||
"keywords": ["mfa", "stark", "authentifizierung", "admin"],
|
||||
"action_hint": "implement",
|
||||
"object_hint": "Starke Authentifizierung",
|
||||
"object_class": "technical_control"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "IAM-03",
|
||||
"title": "Identity Lifecycle Management",
|
||||
"statement": "Identitaeten muessen ueber ihren gesamten Lebenszyklus verwaltet werden.",
|
||||
"keywords": ["lifecycle", "lebenszyklus", "onboarding", "offboarding"],
|
||||
"action_hint": "maintain",
|
||||
"object_hint": "Identitaets-Lebenszyklus",
|
||||
"object_class": "account"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "IAM-04",
|
||||
"title": "Access Review",
|
||||
"statement": "Zugriffsrechte muessen regelmaessig ueberprueft und ueberschuessige Rechte entzogen werden.",
|
||||
"keywords": ["review", "ueberpruefen", "rechte", "rezertifizierung"],
|
||||
"action_hint": "review",
|
||||
"object_hint": "Zugriffsrechte-Review",
|
||||
"object_class": "access_control"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"domain_id": "LOG",
|
||||
"title": "Logging and Monitoring",
|
||||
"aliases": ["log", "logging", "monitoring", "protokollierung", "ueberwachung"],
|
||||
"keywords": ["logging", "monitoring", "protokollierung", "ueberwachung", "siem", "alarm"],
|
||||
"subcontrols": [
|
||||
{
|
||||
"subcontrol_id": "LOG-01",
|
||||
"title": "Logging Policy",
|
||||
"statement": "Protokollierungs-Richtlinien muessen definiert werden, die Umfang und Aufbewahrung festlegen.",
|
||||
"keywords": ["policy", "richtlinie", "umfang", "aufbewahrung"],
|
||||
"action_hint": "document",
|
||||
"object_hint": "Protokollierungsrichtlinie",
|
||||
"object_class": "policy"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "LOG-02",
|
||||
"title": "Security Event Logging",
|
||||
"statement": "Sicherheitsrelevante Ereignisse muessen erfasst und zentral gespeichert werden.",
|
||||
"keywords": ["event", "ereignis", "sicherheit", "zentral"],
|
||||
"action_hint": "configure",
|
||||
"object_hint": "Sicherheits-Event-Logging",
|
||||
"object_class": "configuration"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "LOG-03",
|
||||
"title": "Monitoring and Alerting",
|
||||
"statement": "Sicherheitsrelevante Logs muessen ueberwacht und bei Anomalien Alarme ausgeloest werden.",
|
||||
"keywords": ["monitoring", "alerting", "alarm", "anomalie"],
|
||||
"action_hint": "monitor",
|
||||
"object_hint": "Log-Ueberwachung und Alarmierung",
|
||||
"object_class": "technical_control"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"domain_id": "SEF",
|
||||
"title": "Security Incident Management",
|
||||
"aliases": ["sef", "security incident", "incident management", "vorfallmanagement", "sicherheitsvorfall"],
|
||||
"keywords": ["vorfall", "incident", "sicherheitsvorfall", "reaktion", "response", "meldung"],
|
||||
"subcontrols": [
|
||||
{
|
||||
"subcontrol_id": "SEF-01",
|
||||
"title": "Incident Management Policy",
|
||||
"statement": "Ein Vorfallmanagement-Prozess muss definiert, dokumentiert und getestet werden.",
|
||||
"keywords": ["policy", "richtlinie", "prozess"],
|
||||
"action_hint": "document",
|
||||
"object_hint": "Vorfallmanagement-Richtlinie",
|
||||
"object_class": "policy"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "SEF-02",
|
||||
"title": "Incident Response Team",
|
||||
"statement": "Ein Incident-Response-Team muss benannt und geschult werden.",
|
||||
"keywords": ["team", "response", "schulung"],
|
||||
"action_hint": "define",
|
||||
"object_hint": "Incident-Response-Team",
|
||||
"object_class": "role"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "SEF-03",
|
||||
"title": "Incident Reporting",
|
||||
"statement": "Sicherheitsvorfaelle muessen innerhalb definierter Fristen an zustaendige Stellen gemeldet werden.",
|
||||
"keywords": ["reporting", "meldung", "frist", "behoerde"],
|
||||
"action_hint": "report",
|
||||
"object_hint": "Vorfallmeldung",
|
||||
"object_class": "incident"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "SEF-04",
|
||||
"title": "Incident Lessons Learned",
|
||||
"statement": "Nach jedem Vorfall muss eine Nachbereitung mit Lessons Learned durchgefuehrt werden.",
|
||||
"keywords": ["lessons learned", "nachbereitung", "verbesserung"],
|
||||
"action_hint": "review",
|
||||
"object_hint": "Vorfall-Nachbereitung",
|
||||
"object_class": "record"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"domain_id": "TVM",
|
||||
"title": "Threat and Vulnerability Management",
|
||||
"aliases": ["tvm", "threat", "vulnerability", "schwachstelle", "bedrohung", "schwachstellenmanagement"],
|
||||
"keywords": ["schwachstelle", "vulnerability", "threat", "bedrohung", "patch", "scan"],
|
||||
"subcontrols": [
|
||||
{
|
||||
"subcontrol_id": "TVM-01",
|
||||
"title": "Vulnerability Management Policy",
|
||||
"statement": "Schwachstellenmanagement-Richtlinien muessen definiert und umgesetzt werden.",
|
||||
"keywords": ["policy", "richtlinie"],
|
||||
"action_hint": "document",
|
||||
"object_hint": "Schwachstellenmanagement-Richtlinie",
|
||||
"object_class": "policy"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "TVM-02",
|
||||
"title": "Vulnerability Scanning",
|
||||
"statement": "Systeme muessen regelmaessig auf Schwachstellen gescannt werden.",
|
||||
"keywords": ["scan", "scanning", "regelmaessig"],
|
||||
"action_hint": "test",
|
||||
"object_hint": "Schwachstellenscan",
|
||||
"object_class": "system"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "TVM-03",
|
||||
"title": "Vulnerability Remediation",
|
||||
"statement": "Erkannte Schwachstellen muessen priorisiert und innerhalb definierter Fristen behoben werden.",
|
||||
"keywords": ["remediation", "behebung", "frist", "priorisierung"],
|
||||
"action_hint": "remediate",
|
||||
"object_hint": "Schwachstellenbehebung",
|
||||
"object_class": "system"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "TVM-04",
|
||||
"title": "Penetration Testing",
|
||||
"statement": "Regelmaessige Penetrationstests muessen durchgefuehrt werden.",
|
||||
"keywords": ["penetration", "pentest", "test"],
|
||||
"action_hint": "test",
|
||||
"object_hint": "Penetrationstest",
|
||||
"object_class": "system"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
514
backend-compliance/compliance/data/frameworks/nist_sp800_53.json
Normal file
514
backend-compliance/compliance/data/frameworks/nist_sp800_53.json
Normal file
@@ -0,0 +1,514 @@
|
||||
{
|
||||
"framework_id": "NIST_SP800_53",
|
||||
"display_name": "NIST SP 800-53 Rev. 5",
|
||||
"license": {
|
||||
"type": "public_domain",
|
||||
"rag_allowed": true,
|
||||
"use_as_metadata": true
|
||||
},
|
||||
"domains": [
|
||||
{
|
||||
"domain_id": "AC",
|
||||
"title": "Access Control",
|
||||
"aliases": ["access control", "zugriffskontrolle", "zugriffssteuerung"],
|
||||
"keywords": ["access", "zugriff", "berechtigung", "authorization", "autorisierung"],
|
||||
"subcontrols": [
|
||||
{
|
||||
"subcontrol_id": "AC-1",
|
||||
"title": "Access Control Policy and Procedures",
|
||||
"statement": "Zugriffskontrollrichtlinien und -verfahren muessen definiert, dokumentiert und regelmaessig ueberprueft werden.",
|
||||
"keywords": ["policy", "richtlinie", "verfahren", "procedures"],
|
||||
"action_hint": "document",
|
||||
"object_hint": "Zugriffskontrollrichtlinie",
|
||||
"object_class": "policy"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "AC-2",
|
||||
"title": "Account Management",
|
||||
"statement": "Benutzerkonten muessen ueber ihren gesamten Lebenszyklus verwaltet werden: Erstellung, Aktivierung, Aenderung, Deaktivierung und Loeschung.",
|
||||
"keywords": ["account", "konto", "benutzer", "lifecycle", "lebenszyklus"],
|
||||
"action_hint": "maintain",
|
||||
"object_hint": "Benutzerkontenverwaltung",
|
||||
"object_class": "account"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "AC-3",
|
||||
"title": "Access Enforcement",
|
||||
"statement": "Der Zugriff auf Systemressourcen muss gemaess der definierten Zugriffskontrollrichtlinie durchgesetzt werden.",
|
||||
"keywords": ["enforcement", "durchsetzung", "ressourcen", "system"],
|
||||
"action_hint": "restrict_access",
|
||||
"object_hint": "Zugriffsdurchsetzung",
|
||||
"object_class": "access_control"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "AC-5",
|
||||
"title": "Separation of Duties",
|
||||
"statement": "Aufgabentrennung muss definiert und durchgesetzt werden, um Interessenkonflikte und Missbrauch zu verhindern.",
|
||||
"keywords": ["separation", "trennung", "duties", "aufgaben", "funktionstrennung"],
|
||||
"action_hint": "define",
|
||||
"object_hint": "Aufgabentrennung",
|
||||
"object_class": "role"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "AC-6",
|
||||
"title": "Least Privilege",
|
||||
"statement": "Zugriffsrechte muessen nach dem Prinzip der minimalen Rechte vergeben werden.",
|
||||
"keywords": ["least privilege", "minimal", "rechte", "privileg"],
|
||||
"action_hint": "restrict_access",
|
||||
"object_hint": "Minimale Rechtevergabe",
|
||||
"object_class": "access_control"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "AC-7",
|
||||
"title": "Unsuccessful Logon Attempts",
|
||||
"statement": "Fehlgeschlagene Anmeldeversuche muessen begrenzt und ueberwacht werden.",
|
||||
"keywords": ["logon", "anmeldung", "fehlgeschlagen", "sperre", "lockout"],
|
||||
"action_hint": "monitor",
|
||||
"object_hint": "Anmeldeversuchsueberwachung",
|
||||
"object_class": "technical_control"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "AC-17",
|
||||
"title": "Remote Access",
|
||||
"statement": "Fernzugriff muss autorisiert, ueberwacht und verschluesselt werden.",
|
||||
"keywords": ["remote", "fern", "vpn", "fernzugriff"],
|
||||
"action_hint": "configure",
|
||||
"object_hint": "Fernzugriffskonfiguration",
|
||||
"object_class": "technical_control"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"domain_id": "AU",
|
||||
"title": "Audit and Accountability",
|
||||
"aliases": ["audit", "protokollierung", "accountability", "rechenschaftspflicht"],
|
||||
"keywords": ["audit", "log", "protokoll", "nachvollziehbarkeit", "logging"],
|
||||
"subcontrols": [
|
||||
{
|
||||
"subcontrol_id": "AU-1",
|
||||
"title": "Audit Policy and Procedures",
|
||||
"statement": "Audit- und Protokollierungsrichtlinien muessen definiert und regelmaessig ueberprueft werden.",
|
||||
"keywords": ["policy", "richtlinie", "audit"],
|
||||
"action_hint": "document",
|
||||
"object_hint": "Auditrichtlinie",
|
||||
"object_class": "policy"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "AU-2",
|
||||
"title": "Event Logging",
|
||||
"statement": "Sicherheitsrelevante Ereignisse muessen identifiziert und protokolliert werden.",
|
||||
"keywords": ["event", "ereignis", "logging", "protokollierung"],
|
||||
"action_hint": "configure",
|
||||
"object_hint": "Ereignisprotokollierung",
|
||||
"object_class": "configuration"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "AU-3",
|
||||
"title": "Content of Audit Records",
|
||||
"statement": "Audit-Eintraege muessen ausreichende Informationen enthalten: Zeitstempel, Quelle, Ergebnis, Identitaet.",
|
||||
"keywords": ["content", "inhalt", "record", "eintrag"],
|
||||
"action_hint": "define",
|
||||
"object_hint": "Audit-Eintragsformat",
|
||||
"object_class": "record"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "AU-6",
|
||||
"title": "Audit Record Review and Reporting",
|
||||
"statement": "Audit-Eintraege muessen regelmaessig ueberprueft und bei Anomalien berichtet werden.",
|
||||
"keywords": ["review", "ueberpruefen", "reporting", "anomalie"],
|
||||
"action_hint": "review",
|
||||
"object_hint": "Audit-Ueberpruefung",
|
||||
"object_class": "record"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "AU-9",
|
||||
"title": "Protection of Audit Information",
|
||||
"statement": "Audit-Daten muessen vor unbefugtem Zugriff, Aenderung und Loeschung geschuetzt werden.",
|
||||
"keywords": ["schutz", "protection", "integritaet", "integrity"],
|
||||
"action_hint": "implement",
|
||||
"object_hint": "Audit-Datenschutz",
|
||||
"object_class": "technical_control"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"domain_id": "AT",
|
||||
"title": "Awareness and Training",
|
||||
"aliases": ["awareness", "training", "schulung", "sensibilisierung"],
|
||||
"keywords": ["training", "schulung", "awareness", "sensibilisierung", "weiterbildung"],
|
||||
"subcontrols": [
|
||||
{
|
||||
"subcontrol_id": "AT-1",
|
||||
"title": "Policy and Procedures",
|
||||
"statement": "Schulungs- und Sensibilisierungsrichtlinien muessen definiert und regelmaessig aktualisiert werden.",
|
||||
"keywords": ["policy", "richtlinie"],
|
||||
"action_hint": "document",
|
||||
"object_hint": "Schulungsrichtlinie",
|
||||
"object_class": "policy"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "AT-2",
|
||||
"title": "Literacy Training and Awareness",
|
||||
"statement": "Alle Mitarbeiter muessen regelmaessig Sicherheitsschulungen erhalten.",
|
||||
"keywords": ["mitarbeiter", "schulung", "sicherheit"],
|
||||
"action_hint": "train",
|
||||
"object_hint": "Sicherheitsschulung",
|
||||
"object_class": "training"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "AT-3",
|
||||
"title": "Role-Based Training",
|
||||
"statement": "Rollenbasierte Sicherheitsschulungen muessen fuer Mitarbeiter mit besonderen Sicherheitsaufgaben durchgefuehrt werden.",
|
||||
"keywords": ["rollenbasiert", "role-based", "speziell"],
|
||||
"action_hint": "train",
|
||||
"object_hint": "Rollenbasierte Sicherheitsschulung",
|
||||
"object_class": "training"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"domain_id": "CM",
|
||||
"title": "Configuration Management",
|
||||
"aliases": ["configuration management", "konfigurationsmanagement", "konfiguration"],
|
||||
"keywords": ["konfiguration", "configuration", "baseline", "haertung", "hardening"],
|
||||
"subcontrols": [
|
||||
{
|
||||
"subcontrol_id": "CM-1",
|
||||
"title": "Policy and Procedures",
|
||||
"statement": "Konfigurationsmanagement-Richtlinien muessen dokumentiert und gepflegt werden.",
|
||||
"keywords": ["policy", "richtlinie"],
|
||||
"action_hint": "document",
|
||||
"object_hint": "Konfigurationsmanagement-Richtlinie",
|
||||
"object_class": "policy"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "CM-2",
|
||||
"title": "Baseline Configuration",
|
||||
"statement": "Basiskonfigurationen fuer Systeme muessen definiert, dokumentiert und gepflegt werden.",
|
||||
"keywords": ["baseline", "basis", "standard"],
|
||||
"action_hint": "define",
|
||||
"object_hint": "Basiskonfiguration",
|
||||
"object_class": "configuration"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "CM-6",
|
||||
"title": "Configuration Settings",
|
||||
"statement": "Sicherheitsrelevante Konfigurationseinstellungen muessen definiert und durchgesetzt werden.",
|
||||
"keywords": ["settings", "einstellungen", "sicherheit"],
|
||||
"action_hint": "configure",
|
||||
"object_hint": "Sicherheitskonfiguration",
|
||||
"object_class": "configuration"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "CM-7",
|
||||
"title": "Least Functionality",
|
||||
"statement": "Systeme muessen so konfiguriert werden, dass nur notwendige Funktionen aktiv sind.",
|
||||
"keywords": ["least functionality", "minimal", "dienste", "ports"],
|
||||
"action_hint": "configure",
|
||||
"object_hint": "Minimalkonfiguration",
|
||||
"object_class": "configuration"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "CM-8",
|
||||
"title": "System Component Inventory",
|
||||
"statement": "Ein Inventar aller Systemkomponenten muss gefuehrt und aktuell gehalten werden.",
|
||||
"keywords": ["inventar", "inventory", "komponenten", "assets"],
|
||||
"action_hint": "maintain",
|
||||
"object_hint": "Systemkomponenten-Inventar",
|
||||
"object_class": "register"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"domain_id": "IA",
|
||||
"title": "Identification and Authentication",
|
||||
"aliases": ["identification", "authentication", "identifikation", "authentifizierung"],
|
||||
"keywords": ["authentifizierung", "identifikation", "identity", "passwort", "mfa", "credential"],
|
||||
"subcontrols": [
|
||||
{
|
||||
"subcontrol_id": "IA-1",
|
||||
"title": "Policy and Procedures",
|
||||
"statement": "Identifikations- und Authentifizierungsrichtlinien muessen dokumentiert und regelmaessig ueberprueft werden.",
|
||||
"keywords": ["policy", "richtlinie"],
|
||||
"action_hint": "document",
|
||||
"object_hint": "Authentifizierungsrichtlinie",
|
||||
"object_class": "policy"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "IA-2",
|
||||
"title": "Identification and Authentication",
|
||||
"statement": "Benutzer und Geraete muessen eindeutig identifiziert und authentifiziert werden.",
|
||||
"keywords": ["benutzer", "geraete", "identifizierung"],
|
||||
"action_hint": "implement",
|
||||
"object_hint": "Benutzerauthentifizierung",
|
||||
"object_class": "technical_control"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "IA-2(1)",
|
||||
"title": "Multi-Factor Authentication",
|
||||
"statement": "Multi-Faktor-Authentifizierung muss fuer privilegierte Konten implementiert werden.",
|
||||
"keywords": ["mfa", "multi-faktor", "zwei-faktor", "2fa"],
|
||||
"action_hint": "implement",
|
||||
"object_hint": "Multi-Faktor-Authentifizierung",
|
||||
"object_class": "technical_control"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "IA-5",
|
||||
"title": "Authenticator Management",
|
||||
"statement": "Authentifizierungsmittel (Passwoerter, Token, Zertifikate) muessen sicher verwaltet werden.",
|
||||
"keywords": ["passwort", "token", "zertifikat", "credential"],
|
||||
"action_hint": "maintain",
|
||||
"object_hint": "Authentifizierungsmittel-Verwaltung",
|
||||
"object_class": "technical_control"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"domain_id": "IR",
|
||||
"title": "Incident Response",
|
||||
"aliases": ["incident response", "vorfallbehandlung", "vorfallreaktion", "incident management"],
|
||||
"keywords": ["vorfall", "incident", "reaktion", "response", "breach", "sicherheitsvorfall"],
|
||||
"subcontrols": [
|
||||
{
|
||||
"subcontrol_id": "IR-1",
|
||||
"title": "Policy and Procedures",
|
||||
"statement": "Vorfallreaktionsrichtlinien und -verfahren muessen definiert und regelmaessig aktualisiert werden.",
|
||||
"keywords": ["policy", "richtlinie", "verfahren"],
|
||||
"action_hint": "document",
|
||||
"object_hint": "Vorfallreaktionsrichtlinie",
|
||||
"object_class": "policy"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "IR-2",
|
||||
"title": "Incident Response Training",
|
||||
"statement": "Mitarbeiter muessen regelmaessig in der Vorfallreaktion geschult werden.",
|
||||
"keywords": ["training", "schulung"],
|
||||
"action_hint": "train",
|
||||
"object_hint": "Vorfallreaktionsschulung",
|
||||
"object_class": "training"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "IR-4",
|
||||
"title": "Incident Handling",
|
||||
"statement": "Ein strukturierter Prozess fuer die Vorfallbehandlung muss implementiert werden: Erkennung, Analyse, Eindaemmung, Behebung.",
|
||||
"keywords": ["handling", "behandlung", "erkennung", "eindaemmung"],
|
||||
"action_hint": "implement",
|
||||
"object_hint": "Vorfallbehandlungsprozess",
|
||||
"object_class": "process"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "IR-5",
|
||||
"title": "Incident Monitoring",
|
||||
"statement": "Sicherheitsvorfaelle muessen kontinuierlich ueberwacht und verfolgt werden.",
|
||||
"keywords": ["monitoring", "ueberwachung", "tracking"],
|
||||
"action_hint": "monitor",
|
||||
"object_hint": "Vorfallsueberwachung",
|
||||
"object_class": "incident"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "IR-6",
|
||||
"title": "Incident Reporting",
|
||||
"statement": "Sicherheitsvorfaelle muessen innerhalb definierter Fristen an die zustaendigen Stellen gemeldet werden.",
|
||||
"keywords": ["reporting", "meldung", "melden", "frist"],
|
||||
"action_hint": "report",
|
||||
"object_hint": "Vorfallmeldung",
|
||||
"object_class": "incident"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "IR-8",
|
||||
"title": "Incident Response Plan",
|
||||
"statement": "Ein Vorfallreaktionsplan muss dokumentiert und regelmaessig getestet werden.",
|
||||
"keywords": ["plan", "dokumentation", "test"],
|
||||
"action_hint": "document",
|
||||
"object_hint": "Vorfallreaktionsplan",
|
||||
"object_class": "policy"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"domain_id": "RA",
|
||||
"title": "Risk Assessment",
|
||||
"aliases": ["risk assessment", "risikobewertung", "risikoanalyse"],
|
||||
"keywords": ["risiko", "risk", "bewertung", "assessment", "analyse", "bedrohung", "threat"],
|
||||
"subcontrols": [
|
||||
{
|
||||
"subcontrol_id": "RA-1",
|
||||
"title": "Policy and Procedures",
|
||||
"statement": "Risikobewertungsrichtlinien muessen dokumentiert und regelmaessig aktualisiert werden.",
|
||||
"keywords": ["policy", "richtlinie"],
|
||||
"action_hint": "document",
|
||||
"object_hint": "Risikobewertungsrichtlinie",
|
||||
"object_class": "policy"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "RA-3",
|
||||
"title": "Risk Assessment",
|
||||
"statement": "Regelmaessige Risikobewertungen muessen durchgefuehrt und dokumentiert werden.",
|
||||
"keywords": ["bewertung", "assessment", "regelmaessig"],
|
||||
"action_hint": "assess",
|
||||
"object_hint": "Risikobewertung",
|
||||
"object_class": "risk_artifact"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "RA-5",
|
||||
"title": "Vulnerability Monitoring and Scanning",
|
||||
"statement": "Systeme muessen regelmaessig auf Schwachstellen gescannt und ueberwacht werden.",
|
||||
"keywords": ["vulnerability", "schwachstelle", "scan", "monitoring"],
|
||||
"action_hint": "monitor",
|
||||
"object_hint": "Schwachstellenueberwachung",
|
||||
"object_class": "system"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"domain_id": "SC",
|
||||
"title": "System and Communications Protection",
|
||||
"aliases": ["system protection", "communications protection", "kommunikationsschutz", "systemschutz"],
|
||||
"keywords": ["verschluesselung", "encryption", "tls", "netzwerk", "network", "kommunikation", "firewall"],
|
||||
"subcontrols": [
|
||||
{
|
||||
"subcontrol_id": "SC-1",
|
||||
"title": "Policy and Procedures",
|
||||
"statement": "System- und Kommunikationsschutzrichtlinien muessen dokumentiert und aktuell gehalten werden.",
|
||||
"keywords": ["policy", "richtlinie"],
|
||||
"action_hint": "document",
|
||||
"object_hint": "Kommunikationsschutzrichtlinie",
|
||||
"object_class": "policy"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "SC-7",
|
||||
"title": "Boundary Protection",
|
||||
"statement": "Netzwerkgrenzen muessen durch Firewall-Regeln und Zugangskontrollen geschuetzt werden.",
|
||||
"keywords": ["boundary", "grenze", "firewall", "netzwerk"],
|
||||
"action_hint": "implement",
|
||||
"object_hint": "Netzwerkgrenzschutz",
|
||||
"object_class": "technical_control"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "SC-8",
|
||||
"title": "Transmission Confidentiality and Integrity",
|
||||
"statement": "Daten muessen bei der Uebertragung durch Verschluesselung geschuetzt werden.",
|
||||
"keywords": ["transmission", "uebertragung", "verschluesselung", "tls"],
|
||||
"action_hint": "encrypt",
|
||||
"object_hint": "Uebertragungsverschluesselung",
|
||||
"object_class": "cryptographic_control"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "SC-12",
|
||||
"title": "Cryptographic Key Establishment and Management",
|
||||
"statement": "Kryptographische Schluessel muessen sicher erzeugt, verteilt, gespeichert und widerrufen werden.",
|
||||
"keywords": ["key", "schluessel", "kryptographie", "management"],
|
||||
"action_hint": "maintain",
|
||||
"object_hint": "Schluesselverwaltung",
|
||||
"object_class": "cryptographic_control"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "SC-13",
|
||||
"title": "Cryptographic Protection",
|
||||
"statement": "Kryptographische Mechanismen muessen gemaess anerkannten Standards implementiert werden.",
|
||||
"keywords": ["kryptographie", "verschluesselung", "standard"],
|
||||
"action_hint": "implement",
|
||||
"object_hint": "Kryptographischer Schutz",
|
||||
"object_class": "cryptographic_control"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"domain_id": "SI",
|
||||
"title": "System and Information Integrity",
|
||||
"aliases": ["system integrity", "information integrity", "systemintegritaet", "informationsintegritaet"],
|
||||
"keywords": ["integritaet", "integrity", "malware", "patch", "flaw", "schwachstelle"],
|
||||
"subcontrols": [
|
||||
{
|
||||
"subcontrol_id": "SI-1",
|
||||
"title": "Policy and Procedures",
|
||||
"statement": "System- und Informationsintegritaetsrichtlinien muessen dokumentiert und regelmaessig ueberprueft werden.",
|
||||
"keywords": ["policy", "richtlinie"],
|
||||
"action_hint": "document",
|
||||
"object_hint": "Integritaetsrichtlinie",
|
||||
"object_class": "policy"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "SI-2",
|
||||
"title": "Flaw Remediation",
|
||||
"statement": "Bekannte Schwachstellen muessen innerhalb definierter Fristen behoben werden.",
|
||||
"keywords": ["flaw", "schwachstelle", "patch", "behebung", "remediation"],
|
||||
"action_hint": "remediate",
|
||||
"object_hint": "Schwachstellenbehebung",
|
||||
"object_class": "system"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "SI-3",
|
||||
"title": "Malicious Code Protection",
|
||||
"statement": "Systeme muessen vor Schadsoftware geschuetzt werden durch Erkennung und Abwehrmechanismen.",
|
||||
"keywords": ["malware", "schadsoftware", "antivirus", "erkennung"],
|
||||
"action_hint": "implement",
|
||||
"object_hint": "Schadsoftwareschutz",
|
||||
"object_class": "technical_control"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "SI-4",
|
||||
"title": "System Monitoring",
|
||||
"statement": "Systeme muessen kontinuierlich auf Sicherheitsereignisse und Anomalien ueberwacht werden.",
|
||||
"keywords": ["monitoring", "ueberwachung", "anomalie", "siem"],
|
||||
"action_hint": "monitor",
|
||||
"object_hint": "Systemueberwachung",
|
||||
"object_class": "system"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "SI-5",
|
||||
"title": "Security Alerts and Advisories",
|
||||
"statement": "Sicherheitswarnungen muessen empfangen, bewertet und darauf reagiert werden.",
|
||||
"keywords": ["alert", "warnung", "advisory", "cve"],
|
||||
"action_hint": "monitor",
|
||||
"object_hint": "Sicherheitswarnungen",
|
||||
"object_class": "incident"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"domain_id": "SA",
|
||||
"title": "System and Services Acquisition",
|
||||
"aliases": ["system acquisition", "services acquisition", "systembeschaffung", "secure development"],
|
||||
"keywords": ["beschaffung", "acquisition", "entwicklung", "development", "lieferkette", "supply chain"],
|
||||
"subcontrols": [
|
||||
{
|
||||
"subcontrol_id": "SA-1",
|
||||
"title": "Policy and Procedures",
|
||||
"statement": "Beschaffungsrichtlinien mit Sicherheitsanforderungen muessen dokumentiert werden.",
|
||||
"keywords": ["policy", "richtlinie", "beschaffung"],
|
||||
"action_hint": "document",
|
||||
"object_hint": "Beschaffungsrichtlinie",
|
||||
"object_class": "policy"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "SA-8",
|
||||
"title": "Security and Privacy Engineering Principles",
|
||||
"statement": "Sicherheits- und Datenschutzprinzipien muessen in die Systementwicklung integriert werden.",
|
||||
"keywords": ["engineering", "development", "prinzipien", "design"],
|
||||
"action_hint": "implement",
|
||||
"object_hint": "Security-by-Design-Prinzipien",
|
||||
"object_class": "process"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "SA-11",
|
||||
"title": "Developer Testing and Evaluation",
|
||||
"statement": "Entwickler muessen Sicherheitstests und Code-Reviews durchfuehren.",
|
||||
"keywords": ["testing", "test", "code review", "evaluation"],
|
||||
"action_hint": "test",
|
||||
"object_hint": "Entwickler-Sicherheitstests",
|
||||
"object_class": "process"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "SA-12",
|
||||
"title": "Supply Chain Protection",
|
||||
"statement": "Lieferkettenrisiken muessen bewertet und Schutzmassnahmen implementiert werden.",
|
||||
"keywords": ["supply chain", "lieferkette", "third party", "drittanbieter"],
|
||||
"action_hint": "assess",
|
||||
"object_hint": "Lieferkettenrisikobewertung",
|
||||
"object_class": "risk_artifact"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
353
backend-compliance/compliance/data/frameworks/owasp_asvs.json
Normal file
353
backend-compliance/compliance/data/frameworks/owasp_asvs.json
Normal file
@@ -0,0 +1,353 @@
|
||||
{
|
||||
"framework_id": "OWASP_ASVS",
|
||||
"display_name": "OWASP Application Security Verification Standard 4.0",
|
||||
"license": {
|
||||
"type": "cc_by_sa_4",
|
||||
"rag_allowed": true,
|
||||
"use_as_metadata": true
|
||||
},
|
||||
"domains": [
|
||||
{
|
||||
"domain_id": "V1",
|
||||
"title": "Architecture, Design and Threat Modeling",
|
||||
"aliases": ["architecture", "architektur", "design", "threat modeling", "bedrohungsmodellierung"],
|
||||
"keywords": ["architektur", "design", "threat model", "bedrohung", "modellierung"],
|
||||
"subcontrols": [
|
||||
{
|
||||
"subcontrol_id": "V1.1",
|
||||
"title": "Secure Software Development Lifecycle",
|
||||
"statement": "Ein sicherer Softwareentwicklungs-Lebenszyklus (SSDLC) muss definiert und angewendet werden.",
|
||||
"keywords": ["sdlc", "lifecycle", "lebenszyklus", "entwicklung"],
|
||||
"action_hint": "implement",
|
||||
"object_hint": "Sicherer Entwicklungs-Lebenszyklus",
|
||||
"object_class": "process"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "V1.2",
|
||||
"title": "Authentication Architecture",
|
||||
"statement": "Die Authentifizierungsarchitektur muss dokumentiert und regelmaessig ueberprueft werden.",
|
||||
"keywords": ["authentication", "authentifizierung", "architektur"],
|
||||
"action_hint": "document",
|
||||
"object_hint": "Authentifizierungsarchitektur",
|
||||
"object_class": "policy"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "V1.4",
|
||||
"title": "Access Control Architecture",
|
||||
"statement": "Die Zugriffskontrollarchitektur muss dokumentiert und zentral durchgesetzt werden.",
|
||||
"keywords": ["access control", "zugriffskontrolle", "architektur"],
|
||||
"action_hint": "document",
|
||||
"object_hint": "Zugriffskontrollarchitektur",
|
||||
"object_class": "policy"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "V1.5",
|
||||
"title": "Input and Output Architecture",
|
||||
"statement": "Eingabe- und Ausgabevalidierung muss architektonisch verankert und durchgaengig angewendet werden.",
|
||||
"keywords": ["input", "output", "eingabe", "ausgabe", "validierung"],
|
||||
"action_hint": "implement",
|
||||
"object_hint": "Ein-/Ausgabevalidierung",
|
||||
"object_class": "technical_control"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "V1.6",
|
||||
"title": "Cryptographic Architecture",
|
||||
"statement": "Kryptographische Mechanismen muessen architektonisch definiert und standardisiert sein.",
|
||||
"keywords": ["crypto", "kryptographie", "verschluesselung"],
|
||||
"action_hint": "define",
|
||||
"object_hint": "Kryptographie-Architektur",
|
||||
"object_class": "cryptographic_control"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"domain_id": "V2",
|
||||
"title": "Authentication",
|
||||
"aliases": ["authentication", "authentifizierung", "anmeldung", "login"],
|
||||
"keywords": ["authentication", "authentifizierung", "passwort", "login", "anmeldung", "credential"],
|
||||
"subcontrols": [
|
||||
{
|
||||
"subcontrol_id": "V2.1",
|
||||
"title": "Password Security",
|
||||
"statement": "Passwortrichtlinien muessen Mindestlaenge, Komplexitaet und Sperrmechanismen definieren.",
|
||||
"keywords": ["passwort", "password", "laenge", "komplexitaet"],
|
||||
"action_hint": "define",
|
||||
"object_hint": "Passwortrichtlinie",
|
||||
"object_class": "policy"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "V2.2",
|
||||
"title": "General Authenticator Security",
|
||||
"statement": "Authentifizierungsmittel muessen sicher gespeichert und uebertragen werden.",
|
||||
"keywords": ["authenticator", "credential", "speicherung"],
|
||||
"action_hint": "implement",
|
||||
"object_hint": "Sichere Credential-Verwaltung",
|
||||
"object_class": "technical_control"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "V2.7",
|
||||
"title": "Out-of-Band Verification",
|
||||
"statement": "Out-of-Band-Verifikationsmechanismen muessen sicher implementiert werden.",
|
||||
"keywords": ["oob", "out-of-band", "sms", "push"],
|
||||
"action_hint": "implement",
|
||||
"object_hint": "Out-of-Band-Verifikation",
|
||||
"object_class": "technical_control"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "V2.8",
|
||||
"title": "Multi-Factor Authentication",
|
||||
"statement": "Multi-Faktor-Authentifizierung muss fuer sicherheitskritische Funktionen verfuegbar sein.",
|
||||
"keywords": ["mfa", "multi-faktor", "totp", "fido"],
|
||||
"action_hint": "implement",
|
||||
"object_hint": "Multi-Faktor-Authentifizierung",
|
||||
"object_class": "technical_control"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"domain_id": "V3",
|
||||
"title": "Session Management",
|
||||
"aliases": ["session", "sitzung", "session management", "sitzungsverwaltung"],
|
||||
"keywords": ["session", "sitzung", "token", "cookie", "timeout"],
|
||||
"subcontrols": [
|
||||
{
|
||||
"subcontrol_id": "V3.1",
|
||||
"title": "Session Management Security",
|
||||
"statement": "Sitzungstoken muessen sicher erzeugt, uebertragen und invalidiert werden.",
|
||||
"keywords": ["token", "sitzung", "sicherheit"],
|
||||
"action_hint": "implement",
|
||||
"object_hint": "Sichere Sitzungsverwaltung",
|
||||
"object_class": "technical_control"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "V3.3",
|
||||
"title": "Session Termination",
|
||||
"statement": "Sitzungen muessen nach Inaktivitaet und bei Abmeldung zuverlaessig beendet werden.",
|
||||
"keywords": ["termination", "timeout", "abmeldung", "beenden"],
|
||||
"action_hint": "configure",
|
||||
"object_hint": "Sitzungstimeout",
|
||||
"object_class": "configuration"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "V3.5",
|
||||
"title": "Token-Based Session Management",
|
||||
"statement": "Tokenbasierte Sitzungsmechanismen muessen gegen Diebstahl und Replay geschuetzt sein.",
|
||||
"keywords": ["jwt", "token", "replay", "diebstahl"],
|
||||
"action_hint": "implement",
|
||||
"object_hint": "Token-Schutz",
|
||||
"object_class": "technical_control"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"domain_id": "V5",
|
||||
"title": "Validation, Sanitization and Encoding",
|
||||
"aliases": ["validation", "validierung", "sanitization", "encoding", "eingabevalidierung"],
|
||||
"keywords": ["validierung", "sanitization", "encoding", "xss", "injection", "eingabe"],
|
||||
"subcontrols": [
|
||||
{
|
||||
"subcontrol_id": "V5.1",
|
||||
"title": "Input Validation",
|
||||
"statement": "Alle Eingabedaten muessen serverseitig validiert werden.",
|
||||
"keywords": ["input", "eingabe", "validierung", "serverseitig"],
|
||||
"action_hint": "implement",
|
||||
"object_hint": "Eingabevalidierung",
|
||||
"object_class": "technical_control"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "V5.2",
|
||||
"title": "Sanitization and Sandboxing",
|
||||
"statement": "Eingaben muessen bereinigt und in sicherer Umgebung verarbeitet werden.",
|
||||
"keywords": ["sanitization", "bereinigung", "sandbox"],
|
||||
"action_hint": "implement",
|
||||
"object_hint": "Eingabebereinigung",
|
||||
"object_class": "technical_control"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "V5.3",
|
||||
"title": "Output Encoding and Injection Prevention",
|
||||
"statement": "Ausgaben muessen kontextabhaengig kodiert werden, um Injection-Angriffe zu verhindern.",
|
||||
"keywords": ["output", "encoding", "injection", "xss", "sql"],
|
||||
"action_hint": "implement",
|
||||
"object_hint": "Ausgabe-Encoding",
|
||||
"object_class": "technical_control"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"domain_id": "V6",
|
||||
"title": "Stored Cryptography",
|
||||
"aliases": ["cryptography", "kryptographie", "verschluesselung", "stored cryptography"],
|
||||
"keywords": ["kryptographie", "verschluesselung", "hashing", "schluessel", "key management"],
|
||||
"subcontrols": [
|
||||
{
|
||||
"subcontrol_id": "V6.1",
|
||||
"title": "Data Classification",
|
||||
"statement": "Daten muessen klassifiziert und entsprechend ihrer Schutzklasse behandelt werden.",
|
||||
"keywords": ["klassifizierung", "classification", "schutzklasse"],
|
||||
"action_hint": "define",
|
||||
"object_hint": "Datenklassifizierung",
|
||||
"object_class": "data"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "V6.2",
|
||||
"title": "Algorithms",
|
||||
"statement": "Nur zugelassene und aktuelle kryptographische Algorithmen duerfen verwendet werden.",
|
||||
"keywords": ["algorithmus", "algorithm", "aes", "rsa"],
|
||||
"action_hint": "configure",
|
||||
"object_hint": "Kryptographische Algorithmen",
|
||||
"object_class": "cryptographic_control"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "V6.4",
|
||||
"title": "Secret Management",
|
||||
"statement": "Geheimnisse (Schluessel, Passwoerter, Tokens) muessen in einem Secret-Management-System verwaltet werden.",
|
||||
"keywords": ["secret", "geheimnis", "vault", "key management"],
|
||||
"action_hint": "maintain",
|
||||
"object_hint": "Secret-Management",
|
||||
"object_class": "cryptographic_control"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"domain_id": "V8",
|
||||
"title": "Data Protection",
|
||||
"aliases": ["data protection", "datenschutz", "datenverarbeitung"],
|
||||
"keywords": ["datenschutz", "data protection", "pii", "personenbezogen", "privacy"],
|
||||
"subcontrols": [
|
||||
{
|
||||
"subcontrol_id": "V8.1",
|
||||
"title": "General Data Protection",
|
||||
"statement": "Personenbezogene Daten muessen gemaess Datenschutzanforderungen geschuetzt werden.",
|
||||
"keywords": ["personenbezogen", "pii", "datenschutz"],
|
||||
"action_hint": "implement",
|
||||
"object_hint": "Datenschutzmassnahmen",
|
||||
"object_class": "data"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "V8.2",
|
||||
"title": "Client-Side Data Protection",
|
||||
"statement": "Clientseitig gespeicherte sensible Daten muessen geschuetzt und minimiert werden.",
|
||||
"keywords": ["client", "browser", "localstorage", "cookie"],
|
||||
"action_hint": "implement",
|
||||
"object_hint": "Clientseitiger Datenschutz",
|
||||
"object_class": "technical_control"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "V8.3",
|
||||
"title": "Sensitive Private Data",
|
||||
"statement": "Sensible Daten muessen bei Speicherung und Verarbeitung besonders geschuetzt werden.",
|
||||
"keywords": ["sensibel", "vertraulich", "speicherung"],
|
||||
"action_hint": "encrypt",
|
||||
"object_hint": "Verschluesselung sensibler Daten",
|
||||
"object_class": "data"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"domain_id": "V9",
|
||||
"title": "Communication",
|
||||
"aliases": ["communication", "kommunikation", "tls", "transport"],
|
||||
"keywords": ["tls", "ssl", "https", "transport", "kommunikation", "verschluesselung"],
|
||||
"subcontrols": [
|
||||
{
|
||||
"subcontrol_id": "V9.1",
|
||||
"title": "Client Communication Security",
|
||||
"statement": "Alle Client-Server-Kommunikation muss ueber TLS verschluesselt werden.",
|
||||
"keywords": ["tls", "https", "client", "server"],
|
||||
"action_hint": "encrypt",
|
||||
"object_hint": "TLS-Transportverschluesselung",
|
||||
"object_class": "cryptographic_control"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "V9.2",
|
||||
"title": "Server Communication Security",
|
||||
"statement": "Server-zu-Server-Kommunikation muss authentifiziert und verschluesselt erfolgen.",
|
||||
"keywords": ["server", "mtls", "backend"],
|
||||
"action_hint": "encrypt",
|
||||
"object_hint": "Server-Kommunikationsverschluesselung",
|
||||
"object_class": "cryptographic_control"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"domain_id": "V13",
|
||||
"title": "API and Web Service",
|
||||
"aliases": ["api", "web service", "rest", "graphql", "webservice"],
|
||||
"keywords": ["api", "rest", "graphql", "webservice", "endpoint", "schnittstelle"],
|
||||
"subcontrols": [
|
||||
{
|
||||
"subcontrol_id": "V13.1",
|
||||
"title": "Generic Web Service Security",
|
||||
"statement": "Web-Services muessen gegen gaengige Angriffe abgesichert werden.",
|
||||
"keywords": ["web service", "sicherheit", "angriff"],
|
||||
"action_hint": "implement",
|
||||
"object_hint": "Web-Service-Absicherung",
|
||||
"object_class": "interface"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "V13.2",
|
||||
"title": "RESTful Web Service",
|
||||
"statement": "REST-APIs muessen Input-Validierung, Rate Limiting und sichere Authentifizierung implementieren.",
|
||||
"keywords": ["rest", "api", "rate limiting", "input"],
|
||||
"action_hint": "implement",
|
||||
"object_hint": "REST-API-Absicherung",
|
||||
"object_class": "interface"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "V13.4",
|
||||
"title": "GraphQL and Web Services",
|
||||
"statement": "GraphQL-Endpoints muessen gegen Query-Complexity-Angriffe und Introspection geschuetzt werden.",
|
||||
"keywords": ["graphql", "query", "complexity", "introspection"],
|
||||
"action_hint": "configure",
|
||||
"object_hint": "GraphQL-Absicherung",
|
||||
"object_class": "interface"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"domain_id": "V14",
|
||||
"title": "Configuration",
|
||||
"aliases": ["configuration", "konfiguration", "hardening", "haertung"],
|
||||
"keywords": ["konfiguration", "hardening", "haertung", "header", "deployment"],
|
||||
"subcontrols": [
|
||||
{
|
||||
"subcontrol_id": "V14.1",
|
||||
"title": "Build and Deploy",
|
||||
"statement": "Build- und Deployment-Prozesse muessen sicher konfiguriert und reproduzierbar sein.",
|
||||
"keywords": ["build", "deploy", "ci/cd", "pipeline"],
|
||||
"action_hint": "configure",
|
||||
"object_hint": "Sichere Build-Pipeline",
|
||||
"object_class": "configuration"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "V14.2",
|
||||
"title": "Dependency Management",
|
||||
"statement": "Abhaengigkeiten muessen auf Schwachstellen geprueft und aktuell gehalten werden.",
|
||||
"keywords": ["dependency", "abhaengigkeit", "sca", "sbom"],
|
||||
"action_hint": "maintain",
|
||||
"object_hint": "Abhaengigkeitsverwaltung",
|
||||
"object_class": "system"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "V14.3",
|
||||
"title": "Unintended Security Disclosure",
|
||||
"statement": "Fehlermeldungen und Debug-Informationen duerfen keine sicherheitsrelevanten Details preisgeben.",
|
||||
"keywords": ["disclosure", "fehlermeldung", "debug", "information leakage"],
|
||||
"action_hint": "configure",
|
||||
"object_hint": "Fehlerbehandlung",
|
||||
"object_class": "configuration"
|
||||
},
|
||||
{
|
||||
"subcontrol_id": "V14.4",
|
||||
"title": "HTTP Security Headers",
|
||||
"statement": "HTTP-Sicherheitsheader muessen korrekt konfiguriert sein.",
|
||||
"keywords": ["header", "csp", "hsts", "x-frame"],
|
||||
"action_hint": "configure",
|
||||
"object_hint": "HTTP-Sicherheitsheader",
|
||||
"object_class": "configuration"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
205
backend-compliance/compliance/data/source_type_classification.py
Normal file
205
backend-compliance/compliance/data/source_type_classification.py
Normal file
@@ -0,0 +1,205 @@
|
||||
"""
|
||||
Source-Type-Klassifikation fuer Regulierungen und Frameworks.
|
||||
|
||||
Dreistufiges Modell der normativen Verbindlichkeit:
|
||||
|
||||
Stufe 1 — GESETZ (law):
|
||||
Rechtlich bindend. Bussgeld bei Verstoss.
|
||||
Beispiele: DSGVO, NIS2, AI Act, CRA
|
||||
|
||||
Stufe 2 — LEITLINIE (guideline):
|
||||
Offizielle Auslegungshilfe von Aufsichtsbehoerden.
|
||||
Beweislastumkehr: Wer abweicht, muss begruenden warum.
|
||||
Beispiele: EDPB-Leitlinien, BSI-Standards, WP29-Dokumente
|
||||
|
||||
Stufe 3 — FRAMEWORK (framework):
|
||||
Freiwillige Best Practices, nicht rechtsverbindlich.
|
||||
Aber: Koennen als "Stand der Technik" herangezogen werden.
|
||||
Beispiele: ENISA, NIST, OWASP, OECD, CISA
|
||||
|
||||
Mapping: source_regulation (aus control_parent_links) -> source_type
|
||||
"""
|
||||
|
||||
# --- Type definitions ---
SOURCE_TYPE_LAW = "law"  # statute/regulation/directive — normative_strength stays uncapped
SOURCE_TYPE_GUIDELINE = "guideline"  # guideline/standard — capped at "should"
SOURCE_TYPE_FRAMEWORK = "framework"  # framework/best practice — capped at "may"

# Maximum allowed normative_strength per source_type.
# The DB constraint allows: must, should, may (NOT "can").
NORMATIVE_STRENGTH_CAP: dict[str, str] = {
    SOURCE_TYPE_LAW: "must",  # no cap
    SOURCE_TYPE_GUIDELINE: "should",  # at most "should"
    SOURCE_TYPE_FRAMEWORK: "may",  # at most "may" (= "kann")
}

# Ordering for strength comparisons (higher value = more binding).
STRENGTH_ORDER: dict[str, int] = {
    "may": 1,  # KANN (the value the DB accepts)
    "can": 1,  # legacy alias for "may"
    "should": 2,
    "must": 3,
}
|
||||
|
||||
|
||||
def cap_normative_strength(original: str, source_type: str) -> str:
    """Cap a normative strength at the maximum allowed for *source_type*.

    The DB constraint only permits "must", "should" and "may", so the legacy
    alias "can" is normalized to "may" up front. (Previously "can" leaked
    through unchanged whenever it was already within the cap, violating the
    constraint this module documents.)

    Examples:
        cap_normative_strength("must", "framework") -> "may"
        cap_normative_strength("should", "law") -> "should"
        cap_normative_strength("must", "guideline") -> "should"
        cap_normative_strength("can", "law") -> "may"
    """
    # Normalize the alias so "can" is never returned/persisted.
    if original == "can":
        original = "may"
    cap = NORMATIVE_STRENGTH_CAP.get(source_type, "must")  # unknown type: no cap
    cap_level = STRENGTH_ORDER.get(cap, 3)
    original_level = STRENGTH_ORDER.get(original, 3)  # unknown strength treated as "must"
    if original_level > cap_level:
        return cap
    return original
|
||||
|
||||
|
||||
def get_highest_source_type(source_types: list[str]) -> str:
    """Pick the most binding source type from *source_types*.

    A law trumps a guideline, which trumps a framework; unknown entries rank
    below everything. An empty list falls back to "framework".

    Examples:
        get_highest_source_type(["framework", "law"]) -> "law"
        get_highest_source_type(["framework", "guideline"]) -> "guideline"
    """
    if not source_types:
        return SOURCE_TYPE_FRAMEWORK
    ranking = {SOURCE_TYPE_FRAMEWORK: 1, SOURCE_TYPE_GUIDELINE: 2, SOURCE_TYPE_LAW: 3}
    return max(source_types, key=lambda st: ranking.get(st, 0))
|
||||
|
||||
|
||||
# ============================================================================
# Classification: source_regulation -> source_type
#
# This map is used by the backfill and by future pipeline runs.
# Register new regulations here!
# ============================================================================

SOURCE_REGULATION_CLASSIFICATION: dict[str, str] = {
    # --- EU regulations (directly binding) ---
    "DSGVO (EU) 2016/679": SOURCE_TYPE_LAW,
    "KI-Verordnung (EU) 2024/1689": SOURCE_TYPE_LAW,
    "Cyber Resilience Act (CRA)": SOURCE_TYPE_LAW,
    "NIS2-Richtlinie (EU) 2022/2555": SOURCE_TYPE_LAW,
    "Data Act": SOURCE_TYPE_LAW,
    "Data Governance Act (DGA)": SOURCE_TYPE_LAW,
    "Markets in Crypto-Assets (MiCA)": SOURCE_TYPE_LAW,
    "Maschinenverordnung (EU) 2023/1230": SOURCE_TYPE_LAW,
    "Batterieverordnung (EU) 2023/1542": SOURCE_TYPE_LAW,
    "AML-Verordnung": SOURCE_TYPE_LAW,

    # --- EU directives (binding after national transposition) ---
    # Treated like laws for compliance purposes.

    # --- National laws ---
    "Bundesdatenschutzgesetz (BDSG)": SOURCE_TYPE_LAW,
    "Telekommunikationsgesetz": SOURCE_TYPE_LAW,
    "Telekommunikationsgesetz Oesterreich": SOURCE_TYPE_LAW,
    "Gewerbeordnung (GewO)": SOURCE_TYPE_LAW,
    "Handelsgesetzbuch (HGB)": SOURCE_TYPE_LAW,
    "Abgabenordnung (AO)": SOURCE_TYPE_LAW,
    "IFRS-Übernahmeverordnung": SOURCE_TYPE_LAW,
    "Österreichisches Datenschutzgesetz (DSG)": SOURCE_TYPE_LAW,
    "LOPDGDD - Ley Orgánica de Protección de Datos (Spanien)": SOURCE_TYPE_LAW,
    "Loi Informatique et Libertés (Frankreich)": SOURCE_TYPE_LAW,
    "Információs önrendelkezési jog törvény (Ungarn)": SOURCE_TYPE_LAW,
    # NOTE(review): the Blue Guide is interpretive guidance on EU product
    # rules, not a statute — confirm "law" is the intended classification.
    "EU Blue Guide 2022": SOURCE_TYPE_LAW,

    # --- EDPB/WP29 guidelines (official interpretive guidance) ---
    "EDPB Leitlinien 01/2019 (Zertifizierung)": SOURCE_TYPE_GUIDELINE,
    "EDPB Leitlinien 01/2020 (Datentransfers)": SOURCE_TYPE_GUIDELINE,
    "EDPB Leitlinien 01/2020 (Vernetzte Fahrzeuge)": SOURCE_TYPE_GUIDELINE,
    "EDPB Leitlinien 01/2022 (BCR)": SOURCE_TYPE_GUIDELINE,
    "EDPB Leitlinien 01/2024 (Berechtigtes Interesse)": SOURCE_TYPE_GUIDELINE,
    "EDPB Leitlinien 04/2019 (Data Protection by Design)": SOURCE_TYPE_GUIDELINE,
    "EDPB Leitlinien 05/2020 - Einwilligung": SOURCE_TYPE_GUIDELINE,
    "EDPB Leitlinien 07/2020 (Datentransfers)": SOURCE_TYPE_GUIDELINE,
    "EDPB Leitlinien 08/2020 (Social Media)": SOURCE_TYPE_GUIDELINE,
    "EDPB Leitlinien 09/2022 (Data Breach)": SOURCE_TYPE_GUIDELINE,
    "EDPB Leitlinien 09/2022 - Meldung von Datenschutzverletzungen": SOURCE_TYPE_GUIDELINE,
    "EDPB Empfehlungen 01/2020 - Ergaenzende Massnahmen fuer Datentransfers": SOURCE_TYPE_GUIDELINE,
    "EDPB Leitlinien - Berechtigtes Interesse (Art. 6(1)(f))": SOURCE_TYPE_GUIDELINE,
    "WP244 Leitlinien (Profiling)": SOURCE_TYPE_GUIDELINE,
    "WP251 Leitlinien (Profiling)": SOURCE_TYPE_GUIDELINE,
    "WP260 Leitlinien (Transparenz)": SOURCE_TYPE_GUIDELINE,

    # --- BSI standards (official technical guidelines of a supervisory body) ---
    "BSI-TR-03161-1": SOURCE_TYPE_GUIDELINE,
    "BSI-TR-03161-2": SOURCE_TYPE_GUIDELINE,
    "BSI-TR-03161-3": SOURCE_TYPE_GUIDELINE,

    # --- ENISA (EU agency, but its recommendations are not legally binding) ---
    "ENISA Cybersecurity State 2024": SOURCE_TYPE_FRAMEWORK,
    "ENISA ICS/SCADA Dependencies": SOURCE_TYPE_FRAMEWORK,
    "ENISA Supply Chain Good Practices": SOURCE_TYPE_FRAMEWORK,
    "ENISA Threat Landscape Supply Chain": SOURCE_TYPE_FRAMEWORK,

    # --- NIST (US standards, used internationally as best practice) ---
    "NIST AI Risk Management Framework": SOURCE_TYPE_FRAMEWORK,
    "NIST Cybersecurity Framework 2.0": SOURCE_TYPE_FRAMEWORK,
    "NIST SP 800-207 (Zero Trust)": SOURCE_TYPE_FRAMEWORK,
    "NIST SP 800-218 (SSDF)": SOURCE_TYPE_FRAMEWORK,
    "NIST SP 800-53 Rev. 5": SOURCE_TYPE_FRAMEWORK,
    "NIST SP 800-63-3": SOURCE_TYPE_FRAMEWORK,

    # --- OWASP (community standards) ---
    "OWASP API Security Top 10 (2023)": SOURCE_TYPE_FRAMEWORK,
    "OWASP ASVS 4.0": SOURCE_TYPE_FRAMEWORK,
    "OWASP MASVS 2.0": SOURCE_TYPE_FRAMEWORK,
    "OWASP SAMM 2.0": SOURCE_TYPE_FRAMEWORK,
    "OWASP Top 10 (2021)": SOURCE_TYPE_FRAMEWORK,

    # --- Other frameworks ---
    "OECD KI-Empfehlung": SOURCE_TYPE_FRAMEWORK,
    "CISA Secure by Design": SOURCE_TYPE_FRAMEWORK,
}
|
||||
|
||||
|
||||
def classify_source_regulation(source_regulation: str) -> str:
    """Classify a source_regulation as "law", "guideline" or "framework".

    Uses exact matching against SOURCE_REGULATION_CLASSIFICATION first.
    Unknown sources are guessed from keywords; the fallback is 'framework'
    (the most conservative, least binding result).

    Short indicators ("act", "ley", "loi", "bsi", ...) are matched as whole
    words — a plain substring test misfired badly: "act" matched
    "practices"/"impact", "loi" matched "exploit", "bsi" matched "websites",
    so unknown best-practice documents were classified as laws.
    """
    if not source_regulation:
        return SOURCE_TYPE_FRAMEWORK

    # Exact match
    if source_regulation in SOURCE_REGULATION_CLASSIFICATION:
        return SOURCE_REGULATION_CLASSIFICATION[source_regulation]

    # Heuristic for unknown sources
    lower = source_regulation.lower()
    # Token set for whole-word indicators (parentheses/commas are separators).
    words = set(
        lower.replace("(", " ").replace(")", " ").replace(",", " ").split()
    )

    # Detect laws
    law_substrings = [
        "verordnung", "richtlinie", "gesetz", "directive", "regulation",
        "(eu)", "(eg)",
    ]
    law_words = {"act", "ley", "loi", "törvény", "código"}
    if any(ind in lower for ind in law_substrings) or law_words & words:
        return SOURCE_TYPE_LAW

    # Detect guidelines
    guideline_substrings = [
        "edpb", "leitlinie", "guideline", "wp2", "empfehlung",
    ]
    if any(ind in lower for ind in guideline_substrings) or "bsi" in words:
        return SOURCE_TYPE_GUIDELINE

    # Detect frameworks (informational — same outcome as the fallback below)
    framework_indicators = [
        "enisa", "nist", "owasp", "oecd", "cisa", "framework", "iso",
    ]
    if any(ind in lower for ind in framework_indicators):
        return SOURCE_TYPE_FRAMEWORK

    # Conservative default: unknown = framework (lowest bindingness)
    return SOURCE_TYPE_FRAMEWORK
|
||||
@@ -8,12 +8,16 @@ from .models import (
|
||||
EvidenceDB,
|
||||
RiskDB,
|
||||
AuditExportDB,
|
||||
LLMGenerationAuditDB,
|
||||
AssertionDB,
|
||||
RegulationTypeEnum,
|
||||
ControlTypeEnum,
|
||||
ControlDomainEnum,
|
||||
RiskLevelEnum,
|
||||
EvidenceStatusEnum,
|
||||
ControlStatusEnum,
|
||||
EvidenceConfidenceEnum,
|
||||
EvidenceTruthStatusEnum,
|
||||
)
|
||||
from .repository import (
|
||||
RegulationRepository,
|
||||
@@ -33,6 +37,8 @@ __all__ = [
|
||||
"EvidenceDB",
|
||||
"RiskDB",
|
||||
"AuditExportDB",
|
||||
"LLMGenerationAuditDB",
|
||||
"AssertionDB",
|
||||
# Enums
|
||||
"RegulationTypeEnum",
|
||||
"ControlTypeEnum",
|
||||
@@ -40,6 +46,8 @@ __all__ = [
|
||||
"RiskLevelEnum",
|
||||
"EvidenceStatusEnum",
|
||||
"ControlStatusEnum",
|
||||
"EvidenceConfidenceEnum",
|
||||
"EvidenceTruthStatusEnum",
|
||||
# Repositories
|
||||
"RegulationRepository",
|
||||
"RequirementRepository",
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
164
backend-compliance/compliance/db/vvt_library_models.py
Normal file
164
backend-compliance/compliance/db/vvt_library_models.py
Normal file
@@ -0,0 +1,164 @@
|
||||
"""
|
||||
SQLAlchemy models for VVT Master Libraries + Process Templates.
|
||||
|
||||
Tables (global, no tenant_id):
|
||||
- vvt_lib_data_subjects
|
||||
- vvt_lib_data_categories (hierarchical, self-referencing)
|
||||
- vvt_lib_recipients
|
||||
- vvt_lib_legal_bases
|
||||
- vvt_lib_retention_rules
|
||||
- vvt_lib_transfer_mechanisms
|
||||
- vvt_lib_purposes
|
||||
- vvt_lib_toms
|
||||
|
||||
Tenant-scoped:
|
||||
- vvt_process_templates (system + tenant-specific)
|
||||
"""
|
||||
|
||||
from datetime import datetime
|
||||
|
||||
from sqlalchemy import (
|
||||
Column, String, Text, Boolean, Integer, DateTime, JSON, Index,
|
||||
ForeignKey,
|
||||
)
|
||||
from sqlalchemy.dialects.postgresql import UUID
|
||||
|
||||
from classroom_engine.database import Base
|
||||
|
||||
|
||||
class VVTLibDataSubjectDB(Base):
    """Master-library entry: a category of data subjects (global, no tenant_id)."""

    __tablename__ = 'vvt_lib_data_subjects'

    id = Column(String(50), primary_key=True)  # stable library key, referenced by *_refs JSON lists
    label_de = Column(String(200), nullable=False)  # German display label
    description_de = Column(Text)
    art9_relevant = Column(Boolean, default=False)  # GDPR Art. 9 flag
    typical_for = Column(JSON, default=list)  # JSON list; presumably business functions — verify against seed data
    sort_order = Column(Integer, default=0)  # manual ordering in pickers
    # NOTE(review): naive datetime.utcnow default on a timezone=True column — confirm intended
    created_at = Column(DateTime(timezone=True), default=datetime.utcnow)
|
||||
|
||||
|
||||
class VVTLibDataCategoryDB(Base):
    """Master-library entry: a data category; hierarchical via self-referencing parent_id."""

    __tablename__ = 'vvt_lib_data_categories'

    id = Column(String(50), primary_key=True)
    # Self-reference builds the category tree; deleting a parent orphans children (SET NULL).
    parent_id = Column(String(50), ForeignKey('vvt_lib_data_categories.id', ondelete='SET NULL'), nullable=True)
    label_de = Column(String(200), nullable=False)
    description_de = Column(Text)
    is_art9 = Column(Boolean, default=False)  # GDPR Art. 9 flag
    is_art10 = Column(Boolean, default=False)  # GDPR Art. 10 flag
    risk_weight = Column(Integer, default=1)  # relative weight; presumably feeds risk_score — verify
    default_retention_rule = Column(String(50))  # id into vvt_lib_retention_rules
    default_legal_basis = Column(String(50))  # id into vvt_lib_legal_bases
    sort_order = Column(Integer, default=0)
    # NOTE(review): naive datetime.utcnow default on a timezone=True column — confirm intended
    created_at = Column(DateTime(timezone=True), default=datetime.utcnow)
|
||||
|
||||
|
||||
class VVTLibRecipientDB(Base):
    """Master-library entry: a recipient of personal data (global, no tenant_id)."""

    __tablename__ = 'vvt_lib_recipients'

    id = Column(String(50), primary_key=True)
    type = Column(String(20), nullable=False)  # recipient kind; allowed values not visible here — see seed data
    label_de = Column(String(200), nullable=False)
    description_de = Column(Text)
    is_third_country = Column(Boolean, default=False)  # recipient outside EU/EEA
    country = Column(String(5))  # short country code
    sort_order = Column(Integer, default=0)
    # NOTE(review): naive datetime.utcnow default on a timezone=True column — confirm intended
    created_at = Column(DateTime(timezone=True), default=datetime.utcnow)
|
||||
|
||||
|
||||
class VVTLibLegalBasisDB(Base):
    """Master-library entry: a legal basis for processing (global, no tenant_id)."""

    __tablename__ = 'vvt_lib_legal_bases'

    id = Column(String(50), primary_key=True)
    article = Column(String(50), nullable=False)  # article reference, e.g. a GDPR article
    type = Column(String(30), nullable=False)  # basis kind; allowed values not visible here — see seed data
    label_de = Column(String(300), nullable=False)
    description_de = Column(Text)
    is_art9 = Column(Boolean, default=False)  # GDPR Art. 9 flag
    typical_national_law = Column(String(100))  # national statute usually cited alongside
    sort_order = Column(Integer, default=0)
    # NOTE(review): naive datetime.utcnow default on a timezone=True column — confirm intended
    created_at = Column(DateTime(timezone=True), default=datetime.utcnow)
|
||||
|
||||
|
||||
class VVTLibRetentionRuleDB(Base):
    """Master-library entry: a retention/deletion rule (global, no tenant_id)."""

    __tablename__ = 'vvt_lib_retention_rules'

    id = Column(String(50), primary_key=True)
    label_de = Column(String(300), nullable=False)
    description_de = Column(Text)
    legal_basis = Column(String(200))  # free-text statute backing the retention period
    duration = Column(Integer, nullable=False)  # retention length, interpreted in duration_unit
    duration_unit = Column(String(10), nullable=False)  # unit string; allowed values not visible here
    start_event = Column(String(200))  # event that starts the retention clock
    deletion_procedure = Column(String(500))
    sort_order = Column(Integer, default=0)
    # NOTE(review): naive datetime.utcnow default on a timezone=True column — confirm intended
    created_at = Column(DateTime(timezone=True), default=datetime.utcnow)
|
||||
|
||||
|
||||
class VVTLibTransferMechanismDB(Base):
    """Master-library entry: a third-country transfer mechanism (global, no tenant_id)."""

    __tablename__ = 'vvt_lib_transfer_mechanisms'

    id = Column(String(50), primary_key=True)
    label_de = Column(String(300), nullable=False)
    description_de = Column(Text)
    article = Column(String(50))  # article reference backing the mechanism
    requires_tia = Column(Boolean, default=False)  # transfer impact assessment needed
    sort_order = Column(Integer, default=0)
    # NOTE(review): naive datetime.utcnow default on a timezone=True column — confirm intended
    created_at = Column(DateTime(timezone=True), default=datetime.utcnow)
|
||||
|
||||
|
||||
class VVTLibPurposeDB(Base):
    """Master-library entry: a processing purpose (global, no tenant_id)."""

    __tablename__ = 'vvt_lib_purposes'

    id = Column(String(50), primary_key=True)
    label_de = Column(String(300), nullable=False)
    description_de = Column(Text)
    typical_legal_basis = Column(String(50))  # id into vvt_lib_legal_bases
    typical_for = Column(JSON, default=list)  # JSON list; presumably business functions — verify against seed data
    sort_order = Column(Integer, default=0)
    # NOTE(review): naive datetime.utcnow default on a timezone=True column — confirm intended
    created_at = Column(DateTime(timezone=True), default=datetime.utcnow)
|
||||
|
||||
|
||||
class VVTLibTomDB(Base):
    """Master-library entry: a technical/organizational measure (TOM; global, no tenant_id)."""

    __tablename__ = 'vvt_lib_toms'

    id = Column(String(50), primary_key=True)
    category = Column(String(30), nullable=False)  # TOM grouping; allowed values not visible here
    label_de = Column(String(300), nullable=False)
    description_de = Column(Text)
    art32_reference = Column(String(100))  # GDPR Art. 32 reference
    sort_order = Column(Integer, default=0)
    # NOTE(review): naive datetime.utcnow default on a timezone=True column — confirm intended
    created_at = Column(DateTime(timezone=True), default=datetime.utcnow)
|
||||
|
||||
|
||||
class VVTProcessTemplateDB(Base):
    """Processing-activity template; system-wide (is_system) or tenant-specific (tenant_id).

    All *_refs columns hold JSON lists of ids pointing into the vvt_lib_*
    master tables above.
    """

    __tablename__ = 'vvt_process_templates'

    id = Column(String(80), primary_key=True)
    name = Column(String(300), nullable=False)
    description = Column(Text)
    business_function = Column(String(50))  # indexed below for filtering
    purpose_refs = Column(JSON, default=list)  # ids into vvt_lib_purposes
    legal_basis_refs = Column(JSON, default=list)  # ids into vvt_lib_legal_bases
    data_subject_refs = Column(JSON, default=list)  # ids into vvt_lib_data_subjects
    data_category_refs = Column(JSON, default=list)  # ids into vvt_lib_data_categories
    recipient_refs = Column(JSON, default=list)  # ids into vvt_lib_recipients
    tom_refs = Column(JSON, default=list)  # ids into vvt_lib_toms
    transfer_mechanism_refs = Column(JSON, default=list)  # ids into vvt_lib_transfer_mechanisms
    retention_rule_ref = Column(String(50))  # single id into vvt_lib_retention_rules
    typical_systems = Column(JSON, default=list)
    protection_level = Column(String(10), default='MEDIUM')
    dpia_required = Column(Boolean, default=False)  # data protection impact assessment needed
    risk_score = Column(Integer)
    tags = Column(JSON, default=list)
    is_system = Column(Boolean, default=True)  # True = shipped template; False = tenant-created
    tenant_id = Column(UUID(as_uuid=True), nullable=True)  # NULL for system templates
    sort_order = Column(Integer, default=0)
    # NOTE(review): naive datetime.utcnow defaults on timezone=True columns — confirm intended
    created_at = Column(DateTime(timezone=True), default=datetime.utcnow)
    updated_at = Column(DateTime(timezone=True), default=datetime.utcnow, onupdate=datetime.utcnow)

    __table_args__ = (
        Index('idx_vvt_process_templates_bf', 'business_function'),
        Index('idx_vvt_process_templates_system', 'is_system'),
    )
|
||||
@@ -79,6 +79,26 @@ class VVTActivityDB(Base):
|
||||
next_review_at = Column(DateTime(timezone=True), nullable=True)
|
||||
created_by = Column(String(200), default='system')
|
||||
dsfa_id = Column(UUID(as_uuid=True), nullable=True)
|
||||
|
||||
# Library refs (Phase 1 — parallel to freetext fields)
|
||||
purpose_refs = Column(JSON, nullable=True)
|
||||
legal_basis_refs = Column(JSON, nullable=True)
|
||||
data_subject_refs = Column(JSON, nullable=True)
|
||||
data_category_refs = Column(JSON, nullable=True)
|
||||
recipient_refs = Column(JSON, nullable=True)
|
||||
retention_rule_ref = Column(String(50), nullable=True)
|
||||
transfer_mechanism_refs = Column(JSON, nullable=True)
|
||||
tom_refs = Column(JSON, nullable=True)
|
||||
|
||||
# Cross-module links
|
||||
linked_loeschfristen_ids = Column(JSON, nullable=True)
|
||||
linked_tom_measure_ids = Column(JSON, nullable=True)
|
||||
|
||||
# Template + risk
|
||||
source_template_id = Column(String(80), nullable=True)
|
||||
risk_score = Column(Integer, nullable=True)
|
||||
art30_completeness = Column(JSON, nullable=True)
|
||||
|
||||
created_at = Column(DateTime, default=datetime.utcnow, nullable=False)
|
||||
updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
|
||||
|
||||
|
||||
@@ -69,7 +69,7 @@ class AnchorFinder:
|
||||
tags_str = " ".join(control.tags[:3]) if control.tags else ""
|
||||
query = f"{control.title} {tags_str}".strip()
|
||||
|
||||
results = await self.rag.search(
|
||||
results = await self.rag.search_with_rerank(
|
||||
query=query,
|
||||
collection="bp_compliance_ce",
|
||||
top_k=15,
|
||||
|
||||
80
backend-compliance/compliance/services/assertion_engine.py
Normal file
80
backend-compliance/compliance/services/assertion_engine.py
Normal file
@@ -0,0 +1,80 @@
|
||||
"""Assertion Engine — splits text into sentences and classifies each.
|
||||
|
||||
Each sentence is tagged as:
|
||||
- assertion: normative statement (pflicht / empfehlung / kann)
|
||||
- fact: references concrete evidence artifacts
|
||||
- rationale: explains why something is required
|
||||
"""
|
||||
|
||||
import re
|
||||
from typing import Optional
|
||||
|
||||
from .normative_patterns import (
|
||||
PFLICHT_RE, EMPFEHLUNG_RE, KANN_RE, RATIONALE_RE, EVIDENCE_RE,
|
||||
)
|
||||
|
||||
# Sentence splitter: a period/exclamation/question mark followed by whitespace
# and an uppercase letter (A-Z or German umlauts), or a blank line between
# paragraphs.
# NOTE(review): a sentence starting with any other accented capital (e.g. "É")
# will not trigger a split — confirm acceptable for the expected corpora.
_SENTENCE_SPLIT = re.compile(r'(?<=[.!?])\s+(?=[A-ZÄÖÜ])|(?:\n\s*\n)')
|
||||
|
||||
|
||||
def extract_assertions(
    text: str,
    entity_type: str,
    entity_id: str,
    tenant_id: Optional[str] = None,
) -> list[dict]:
    """Split *text* into sentences and classify each one.

    Sentences shorter than 5 characters are dropped; sentence_index keeps
    the pre-filter position so gaps reveal skipped fragments.

    Returns a list of dicts ready for AssertionDB creation.
    """
    if not text or not text.strip():
        return []

    records: list[dict] = []
    for position, chunk in enumerate(_SENTENCE_SPLIT.split(text.strip())):
        candidate = chunk.strip()
        if len(candidate) < 5:
            # Empty fragments and tiny splinters carry no classifiable content.
            continue

        kind, tier = _classify_sentence(candidate)

        records.append({
            "tenant_id": tenant_id,
            "entity_type": entity_type,
            "entity_id": entity_id,
            "sentence_text": candidate,
            "sentence_index": position,
            "assertion_type": kind,
            "normative_tier": tier,
            "evidence_ids": [],
            "confidence": 0.0,
        })

    return records
|
||||
|
||||
|
||||
def _classify_sentence(sentence: str) -> tuple[str, Optional[str]]:
    """Return (assertion_type, normative_tier) for a single sentence."""

    # Evidence keywords win outright: the sentence references a concrete artifact.
    if EVIDENCE_RE.search(sentence):
        return ("fact", None)

    # A sentence that is at least as much "why" as "must/should/may" is rationale.
    normative_hits = sum(
        len(rx.findall(sentence))
        for rx in (PFLICHT_RE, EMPFEHLUNG_RE, KANN_RE)
    )
    rationale_hits = len(RATIONALE_RE.findall(sentence))
    if rationale_hits and rationale_hits >= normative_hits:
        return ("rationale", None)

    # Normative classification, strongest tier first.
    for regex, tier in (
        (PFLICHT_RE, "pflicht"),
        (EMPFEHLUNG_RE, "empfehlung"),
        (KANN_RE, "kann"),
    ):
        if regex.search(sentence):
            return ("assertion", tier)

    # No signal at all: keep it as an untiered assertion.
    return ("assertion", None)
|
||||
618
backend-compliance/compliance/services/batch_dedup_runner.py
Normal file
618
backend-compliance/compliance/services/batch_dedup_runner.py
Normal file
@@ -0,0 +1,618 @@
|
||||
"""Batch Dedup Runner — Orchestrates deduplication of ~85k atomare Controls.
|
||||
|
||||
Reduces Pass 0b controls from ~85k to ~18-25k unique Master Controls via:
|
||||
Phase 1: Intra-Group Dedup — same merge_group_hint → pick best, link rest
|
||||
(85k → ~52k, mostly title-identical short-circuit, no embeddings)
|
||||
Phase 2: Cross-Group Dedup — embed masters, search Qdrant for similar
|
||||
masters with different hints (52k → ~18-25k)
|
||||
|
||||
All Pass 0b controls have pattern_id=NULL. The primary grouping key is
|
||||
merge_group_hint (format: "action_type:norm_obj:trigger_key"), which
|
||||
encodes the normalized action, object, and trigger.
|
||||
|
||||
Usage:
|
||||
runner = BatchDedupRunner(db)
|
||||
stats = await runner.run(dry_run=True) # preview
|
||||
stats = await runner.run(dry_run=False) # execute
|
||||
stats = await runner.run(hint_filter="implement:multi_factor_auth:none")
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import time
|
||||
from collections import defaultdict
|
||||
|
||||
from sqlalchemy import text
|
||||
|
||||
from compliance.services.control_dedup import (
|
||||
canonicalize_text,
|
||||
ensure_qdrant_collection,
|
||||
get_embedding,
|
||||
normalize_action,
|
||||
normalize_object,
|
||||
qdrant_search_cross_regulation,
|
||||
qdrant_upsert,
|
||||
LINK_THRESHOLD,
|
||||
REVIEW_THRESHOLD,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)

# Default Qdrant collection holding embeddings for the batch dedup run.
DEDUP_COLLECTION = "atomic_controls_dedup"
|
||||
|
||||
|
||||
# ── Quality Score ────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def _coerce_json_list(value) -> list:
    """Best-effort: return *value* as a list, decoding JSON text if needed.

    DB rows deliver JSON columns as text, while in-memory dicts may already
    hold lists. Falsy or undecodable input counts as empty rather than raising.
    """
    if not value:
        return []
    if isinstance(value, str):
        try:
            value = json.loads(value)
        except (json.JSONDecodeError, TypeError):
            return []
    return value


def quality_score(control: dict) -> float:
    """Score a control by richness of requirements, tests, evidence, and objective.

    Weights: 2.0 per requirement, 1.5 per test step, 1.0 per evidence item,
    plus up to 3.0 for objective length (1 point per 200 characters).

    Higher score = better candidate for master control.
    """
    score = 2.0 * len(_coerce_json_list(control.get("requirements")))
    score += 1.5 * len(_coerce_json_list(control.get("test_procedure")))
    score += 1.0 * len(_coerce_json_list(control.get("evidence")))

    objective = control.get("objective") or ""
    score += min(len(objective) / 200, 3.0)

    return score
|
||||
|
||||
|
||||
# ── Batch Dedup Runner ───────────────────────────────────────────────────
|
||||
|
||||
|
||||
class BatchDedupRunner:
|
||||
"""Batch dedup orchestrator for existing Pass 0b atomic controls."""
|
||||
|
||||
    def __init__(self, db, collection: str = DEDUP_COLLECTION):
        """Store the DB session, target Qdrant collection, and zeroed counters.

        Args:
            db: SQLAlchemy session used for all reads and rollbacks.
            collection: Qdrant collection name for dedup embeddings.
        """
        self.db = db
        self.collection = collection
        # Counters returned by run(); key names are part of the reported stats.
        self.stats = {
            "total_controls": 0,            # Pass 0b controls loaded
            "unique_hints": 0,              # distinct merge_group_hint values
            "phase1_groups_processed": 0,
            "masters": 0,                   # controls kept as master
            "linked": 0,                    # controls linked to a master
            "review": 0,                    # borderline matches flagged for review
            "new_controls": 0,
            "parent_links_transferred": 0,
            "cross_group_linked": 0,        # Phase 2 links
            "cross_group_review": 0,        # Phase 2 review flags
            "errors": 0,
            "skipped_title_identical": 0,   # title-identical short-circuit (no embedding)
        }
        # Progress reporting state: current phase plus processed/total counts.
        self._progress_phase = ""
        self._progress_count = 0
        self._progress_total = 0
|
||||
|
||||
    async def run(
        self,
        dry_run: bool = False,
        hint_filter: str = None,
    ) -> dict:
        """Run the full batch dedup pipeline (Phase 1, then Phase 2).

        Args:
            dry_run: If True, compute stats but don't modify DB/Qdrant.
                Note: Phase 2 (cross-group embeddings) is skipped entirely
                in dry-run mode, so a preview covers Phase 1 only.
            hint_filter: If set, only process groups whose merge_group_hint
                starts with this prefix.

        Returns:
            Stats dict with counts (see self.stats), plus "elapsed_seconds".
        """
        start = time.monotonic()
        logger.info("BatchDedup starting (dry_run=%s, hint_filter=%s)",
                    dry_run, hint_filter)

        # The Qdrant collection must exist before Phase 1 can upsert vectors.
        if not dry_run:
            await ensure_qdrant_collection(collection=self.collection)

        # Phase 1: Intra-group dedup (same merge_group_hint)
        self._progress_phase = "phase1"
        groups = self._load_merge_groups(hint_filter)
        self._progress_total = self.stats["total_controls"]

        for hint, controls in groups:
            try:
                await self._process_hint_group(hint, controls, dry_run)
                self.stats["phase1_groups_processed"] += 1
            except Exception as e:
                # One bad group must not abort the run: log it, count it,
                # roll back any half-applied DB changes, and continue.
                logger.error("BatchDedup Phase 1 error on hint %s: %s", hint, e)
                self.stats["errors"] += 1
                try:
                    self.db.rollback()
                except Exception:
                    pass  # best-effort rollback; session may already be unusable

        logger.info(
            "BatchDedup Phase 1 done: %d masters, %d linked, %d review",
            self.stats["masters"], self.stats["linked"], self.stats["review"],
        )

        # Phase 2: Cross-group dedup via embeddings
        if not dry_run:
            self._progress_phase = "phase2"
            await self._run_cross_group_pass()

        elapsed = time.monotonic() - start
        self.stats["elapsed_seconds"] = round(elapsed, 1)
        logger.info("BatchDedup completed in %.1fs: %s", elapsed, self.stats)
        return self.stats
|
||||
|
||||
    def _load_merge_groups(self, hint_filter: "str | None" = None) -> list:
        """Load all Pass 0b controls grouped by merge_group_hint, largest first.

        Args:
            hint_filter: Optional prefix; when set, only controls whose
                merge_group_hint starts with it are loaded.

        Returns:
            List of (hint, [control dict, ...]) tuples sorted by group size
            (largest group first). Also sets stats["total_controls"] and
            stats["unique_hints"] as a side effect.
        """
        # WHERE clause is assembled from literal fragments only; the
        # user-supplied hint_filter is always bound as a parameter.
        conditions = [
            "decomposition_method = 'pass0b'",
            "release_state != 'deprecated'",
            "release_state != 'duplicate'",
        ]
        params = {}

        if hint_filter:
            conditions.append("generation_metadata->>'merge_group_hint' LIKE :hf")
            params["hf"] = f"{hint_filter}%"

        where = " AND ".join(conditions)
        rows = self.db.execute(text(f"""
            SELECT id::text, control_id, title, objective,
                   pattern_id, requirements::text, test_procedure::text,
                   evidence::text, release_state,
                   generation_metadata->>'merge_group_hint' as merge_group_hint,
                   generation_metadata->>'action_object_class' as action_object_class
            FROM canonical_controls
            WHERE {where}
            ORDER BY control_id
        """), params).fetchall()

        # Group rows by hint; controls without a hint all land in the "" bucket.
        by_hint = defaultdict(list)
        for r in rows:
            by_hint[r[9] or ""].append({
                "uuid": r[0],
                "control_id": r[1],
                "title": r[2],
                "objective": r[3],
                "pattern_id": r[4],
                "requirements": r[5],
                "test_procedure": r[6],
                "evidence": r[7],
                "release_state": r[8],
                "merge_group_hint": r[9] or "",
                "action_object_class": r[10] or "",
            })

        self.stats["total_controls"] = len(rows)
        self.stats["unique_hints"] = len(by_hint)

        # Process the biggest groups first so progress logging front-loads
        # the expensive work.
        sorted_groups = sorted(by_hint.items(), key=lambda x: len(x[1]), reverse=True)
        logger.info("BatchDedup loaded %d controls in %d hint groups",
                    len(rows), len(sorted_groups))
        return sorted_groups
|
||||
|
||||
def _sub_group_by_merge_hint(self, controls: list) -> dict:
|
||||
"""Group controls by merge_group_hint composite key."""
|
||||
groups = defaultdict(list)
|
||||
for c in controls:
|
||||
hint = c["merge_group_hint"]
|
||||
if hint:
|
||||
groups[hint].append(c)
|
||||
else:
|
||||
groups[f"__no_hint_{c['uuid']}"].append(c)
|
||||
return dict(groups)
|
||||
|
||||
    async def _process_hint_group(
        self,
        hint: str,
        controls: list,
        dry_run: bool,
    ):
        """Process all controls sharing the same merge_group_hint.

        Within a hint group, all controls share action+object+trigger.
        The best-quality control becomes master, rest are linked as duplicates.

        Updates stats["masters"], stats["linked"], stats["review"] and
        stats["skipped_title_identical"], and advances the progress counter
        by exactly one regardless of group size.
        """
        if len(controls) < 2:
            # Singleton → always master
            self.stats["masters"] += 1
            if not dry_run:
                self.stats and None  # (no-op removed)
            self._progress_count += 1
            self._log_progress(hint)
            return

        # Sort by quality score (best first); quality_score is a module-level
        # heuristic, not a method on this class.
        sorted_group = sorted(controls, key=quality_score, reverse=True)
        master = sorted_group[0]
        self.stats["masters"] += 1

        if not dry_run:
            await self._embed_and_index(master)

        for candidate in sorted_group[1:]:
            # All share the same hint → check title similarity
            if candidate["title"].strip().lower() == master["title"].strip().lower():
                # Identical title → direct link (no embedding needed)
                self.stats["linked"] += 1
                self.stats["skipped_title_identical"] += 1
                if not dry_run:
                    await self._mark_duplicate(master, candidate, confidence=1.0)
            else:
                # Different title within same hint → still likely duplicate
                # Use embedding to verify
                await self._check_and_link_within_group(master, candidate, dry_run)

        self._progress_count += 1
        self._log_progress(hint)
|
||||
|
||||
    async def _check_and_link_within_group(
        self,
        master: dict,
        candidate: dict,
        dry_run: bool,
    ):
        """Check if candidate (same hint group) is duplicate of master via embedding.

        Decision ladder:
          * embedding unavailable or no Qdrant hits → link to master anyway at
            confidence 0.90 (same hint implies same action+object);
          * best score > LINK_THRESHOLD → link to the Qdrant-matched control;
          * best score > REVIEW_THRESHOLD → write a manual-review row;
          * otherwise → treat as a new master and index it.
        """
        # The hint is an "action:object:trigger" composite; only the first two
        # parts feed the canonical embedding text.
        parts = candidate["merge_group_hint"].split(":", 2)
        action = parts[0] if len(parts) > 0 else ""
        obj = parts[1] if len(parts) > 1 else ""

        canonical = canonicalize_text(action, obj, candidate["title"])
        embedding = await get_embedding(canonical)

        if not embedding:
            # Can't embed → link anyway (same hint = same action+object)
            self.stats["linked"] += 1
            if not dry_run:
                await self._mark_duplicate(master, candidate, confidence=0.90)
            return

        # Search the dedup collection (unfiltered — pattern_id is NULL)
        results = await qdrant_search_cross_regulation(
            embedding, top_k=3, collection=self.collection,
        )

        if not results:
            # No Qdrant matches yet (master might not be indexed yet) → link to master
            self.stats["linked"] += 1
            if not dry_run:
                await self._mark_duplicate(master, candidate, confidence=0.90)
            return

        best = results[0]
        best_score = best.get("score", 0.0)
        best_payload = best.get("payload", {})
        best_uuid = best_payload.get("control_uuid", "")

        if best_score > LINK_THRESHOLD:
            # NOTE: links to the Qdrant best match, which may differ from the
            # in-group master passed in.
            self.stats["linked"] += 1
            if not dry_run:
                await self._mark_duplicate_to(best_uuid, candidate, confidence=best_score)
        elif best_score > REVIEW_THRESHOLD:
            self.stats["review"] += 1
            if not dry_run:
                self._write_review(candidate, best_payload, best_score)
        else:
            # Very different despite same hint → new master
            self.stats["new_controls"] += 1
            if not dry_run:
                await self._index_with_embedding(candidate, embedding)
|
||||
|
||||
async def _run_cross_group_pass(self):
|
||||
"""Phase 2: Find cross-group duplicates among surviving masters.
|
||||
|
||||
After Phase 1, ~52k masters remain. Many have similar semantics
|
||||
despite different merge_group_hints (e.g. different German spellings).
|
||||
This pass embeds all masters and finds near-duplicates via Qdrant.
|
||||
"""
|
||||
logger.info("BatchDedup Phase 2: Cross-group pass starting...")
|
||||
|
||||
rows = self.db.execute(text("""
|
||||
SELECT id::text, control_id, title,
|
||||
generation_metadata->>'merge_group_hint' as merge_group_hint
|
||||
FROM canonical_controls
|
||||
WHERE decomposition_method = 'pass0b'
|
||||
AND release_state != 'duplicate'
|
||||
AND release_state != 'deprecated'
|
||||
ORDER BY control_id
|
||||
""")).fetchall()
|
||||
|
||||
self._progress_total = len(rows)
|
||||
self._progress_count = 0
|
||||
logger.info("BatchDedup Cross-group: %d masters to check", len(rows))
|
||||
cross_linked = 0
|
||||
cross_review = 0
|
||||
|
||||
for i, r in enumerate(rows):
|
||||
uuid = r[0]
|
||||
hint = r[3] or ""
|
||||
parts = hint.split(":", 2)
|
||||
action = parts[0] if len(parts) > 0 else ""
|
||||
obj = parts[1] if len(parts) > 1 else ""
|
||||
|
||||
canonical = canonicalize_text(action, obj, r[2])
|
||||
embedding = await get_embedding(canonical)
|
||||
if not embedding:
|
||||
continue
|
||||
|
||||
results = await qdrant_search_cross_regulation(
|
||||
embedding, top_k=5, collection=self.collection,
|
||||
)
|
||||
if not results:
|
||||
continue
|
||||
|
||||
# Find best match from a DIFFERENT hint group
|
||||
for match in results:
|
||||
match_score = match.get("score", 0.0)
|
||||
match_payload = match.get("payload", {})
|
||||
match_uuid = match_payload.get("control_uuid", "")
|
||||
|
||||
# Skip self-match
|
||||
if match_uuid == uuid:
|
||||
continue
|
||||
|
||||
# Must be a different hint group (otherwise already handled in Phase 1)
|
||||
match_action = match_payload.get("action_normalized", "")
|
||||
match_object = match_payload.get("object_normalized", "")
|
||||
# Simple check: different control UUID is enough
|
||||
if match_score > LINK_THRESHOLD:
|
||||
# Mark the worse one as duplicate
|
||||
try:
|
||||
self.db.execute(text("""
|
||||
UPDATE canonical_controls
|
||||
SET release_state = 'duplicate', merged_into_uuid = CAST(:master AS uuid)
|
||||
WHERE id = CAST(:dup AS uuid)
|
||||
AND release_state != 'duplicate'
|
||||
"""), {"master": match_uuid, "dup": uuid})
|
||||
|
||||
self.db.execute(text("""
|
||||
INSERT INTO control_parent_links
|
||||
(control_uuid, parent_control_uuid, link_type, confidence)
|
||||
VALUES (CAST(:cu AS uuid), CAST(:pu AS uuid), 'cross_regulation', :conf)
|
||||
ON CONFLICT (control_uuid, parent_control_uuid) DO NOTHING
|
||||
"""), {"cu": match_uuid, "pu": uuid, "conf": match_score})
|
||||
|
||||
# Transfer parent links
|
||||
transferred = self._transfer_parent_links(match_uuid, uuid)
|
||||
self.stats["parent_links_transferred"] += transferred
|
||||
|
||||
self.db.commit()
|
||||
cross_linked += 1
|
||||
except Exception as e:
|
||||
logger.error("BatchDedup cross-group link error %s→%s: %s",
|
||||
uuid, match_uuid, e)
|
||||
self.db.rollback()
|
||||
self.stats["errors"] += 1
|
||||
break # Only one cross-link per control
|
||||
elif match_score > REVIEW_THRESHOLD:
|
||||
self._write_review(
|
||||
{"control_id": r[1], "title": r[2], "objective": "",
|
||||
"merge_group_hint": hint, "pattern_id": None},
|
||||
match_payload, match_score,
|
||||
)
|
||||
cross_review += 1
|
||||
break
|
||||
|
||||
self._progress_count = i + 1
|
||||
if (i + 1) % 500 == 0:
|
||||
logger.info("BatchDedup Cross-group: %d/%d checked, %d linked, %d review",
|
||||
i + 1, len(rows), cross_linked, cross_review)
|
||||
|
||||
self.stats["cross_group_linked"] = cross_linked
|
||||
self.stats["cross_group_review"] = cross_review
|
||||
logger.info("BatchDedup Cross-group complete: %d linked, %d review",
|
||||
cross_linked, cross_review)
|
||||
|
||||
# ── Qdrant Helpers ───────────────────────────────────────────────────
|
||||
|
||||
async def _embed_and_index(self, control: dict):
|
||||
"""Compute embedding and index a control in the dedup Qdrant collection."""
|
||||
parts = control["merge_group_hint"].split(":", 2)
|
||||
action = parts[0] if len(parts) > 0 else ""
|
||||
obj = parts[1] if len(parts) > 1 else ""
|
||||
|
||||
norm_action = normalize_action(action)
|
||||
norm_object = normalize_object(obj)
|
||||
canonical = canonicalize_text(action, obj, control["title"])
|
||||
embedding = await get_embedding(canonical)
|
||||
|
||||
if not embedding:
|
||||
return
|
||||
|
||||
await qdrant_upsert(
|
||||
point_id=control["uuid"],
|
||||
embedding=embedding,
|
||||
payload={
|
||||
"control_uuid": control["uuid"],
|
||||
"control_id": control["control_id"],
|
||||
"title": control["title"],
|
||||
"pattern_id": control.get("pattern_id"),
|
||||
"action_normalized": norm_action,
|
||||
"object_normalized": norm_object,
|
||||
"canonical_text": canonical,
|
||||
"merge_group_hint": control["merge_group_hint"],
|
||||
},
|
||||
collection=self.collection,
|
||||
)
|
||||
|
||||
    async def _index_with_embedding(self, control: dict, embedding: list):
        """Index a control with a pre-computed embedding.

        Derives action/object from the control's merge_group_hint
        ("action:object:trigger" composite) and upserts the point into the
        dedup Qdrant collection, keyed by the control's UUID so re-indexing
        is idempotent.
        """
        parts = control["merge_group_hint"].split(":", 2)
        action = parts[0] if len(parts) > 0 else ""
        obj = parts[1] if len(parts) > 1 else ""

        norm_action = normalize_action(action)
        norm_object = normalize_object(obj)
        canonical = canonicalize_text(action, obj, control["title"])

        await qdrant_upsert(
            point_id=control["uuid"],
            embedding=embedding,
            payload={
                "control_uuid": control["uuid"],
                "control_id": control["control_id"],
                "title": control["title"],
                "pattern_id": control.get("pattern_id"),
                "action_normalized": norm_action,
                "object_normalized": norm_object,
                "canonical_text": canonical,
                "merge_group_hint": control["merge_group_hint"],
            },
            collection=self.collection,
        )
|
||||
|
||||
# ── DB Write Helpers ─────────────────────────────────────────────────
|
||||
|
||||
    async def _mark_duplicate(self, master: dict, candidate: dict, confidence: float):
        """Mark candidate as duplicate of master, transfer parent links.

        Sets release_state='duplicate' and merged_into_uuid on the candidate,
        records a 'dedup_merge' link (control=master, parent=candidate) at the
        given confidence, copies the candidate's decomposition parent links
        onto the master, then commits. Rolls back and re-raises on any error
        so the caller can count it.
        """
        try:
            self.db.execute(text("""
                UPDATE canonical_controls
                SET release_state = 'duplicate', merged_into_uuid = CAST(:master AS uuid)
                WHERE id = CAST(:cand AS uuid)
            """), {"master": master["uuid"], "cand": candidate["uuid"]})

            self.db.execute(text("""
                INSERT INTO control_parent_links
                    (control_uuid, parent_control_uuid, link_type, confidence)
                VALUES (CAST(:master AS uuid), CAST(:cand_parent AS uuid), 'dedup_merge', :conf)
                ON CONFLICT (control_uuid, parent_control_uuid) DO NOTHING
            """), {"master": master["uuid"], "cand_parent": candidate["uuid"], "conf": confidence})

            transferred = self._transfer_parent_links(master["uuid"], candidate["uuid"])
            self.stats["parent_links_transferred"] += transferred

            self.db.commit()
        except Exception as e:
            logger.error("BatchDedup _mark_duplicate error %s→%s: %s",
                         candidate["uuid"], master["uuid"], e)
            self.db.rollback()
            raise
|
||||
|
||||
    async def _mark_duplicate_to(self, master_uuid: str, candidate: dict, confidence: float):
        """Mark candidate as duplicate of a Qdrant-matched master.

        Same transactional sequence as _mark_duplicate, but the master is
        identified only by UUID (taken from a Qdrant payload rather than a
        loaded control dict). Commits on success; rolls back and re-raises
        on failure.
        """
        try:
            self.db.execute(text("""
                UPDATE canonical_controls
                SET release_state = 'duplicate', merged_into_uuid = CAST(:master AS uuid)
                WHERE id = CAST(:cand AS uuid)
            """), {"master": master_uuid, "cand": candidate["uuid"]})

            self.db.execute(text("""
                INSERT INTO control_parent_links
                    (control_uuid, parent_control_uuid, link_type, confidence)
                VALUES (CAST(:master AS uuid), CAST(:cand_parent AS uuid), 'dedup_merge', :conf)
                ON CONFLICT (control_uuid, parent_control_uuid) DO NOTHING
            """), {"master": master_uuid, "cand_parent": candidate["uuid"], "conf": confidence})

            transferred = self._transfer_parent_links(master_uuid, candidate["uuid"])
            self.stats["parent_links_transferred"] += transferred

            self.db.commit()
        except Exception as e:
            logger.error("BatchDedup _mark_duplicate_to error %s→%s: %s",
                         candidate["uuid"], master_uuid, e)
            self.db.rollback()
            raise
|
||||
|
||||
    def _transfer_parent_links(self, master_uuid: str, duplicate_uuid: str) -> int:
        """Copy the duplicate's 'decomposition' parent links onto the master.

        NOTE(review): despite the original "move" wording, the duplicate's own
        link rows are not deleted here — they are only re-inserted under the
        master. Self-links (parent == master) are skipped. Does not commit;
        the caller owns the transaction.

        Returns:
            Number of inserts attempted (rows swallowed by
            ON CONFLICT DO NOTHING are still counted).
        """
        rows = self.db.execute(text("""
            SELECT parent_control_uuid::text, link_type, confidence,
                   source_regulation, source_article, obligation_candidate_id::text
            FROM control_parent_links
            WHERE control_uuid = CAST(:dup AS uuid)
              AND link_type = 'decomposition'
        """), {"dup": duplicate_uuid}).fetchall()

        transferred = 0
        for r in rows:
            parent_uuid = r[0]
            if parent_uuid == master_uuid:
                # Would create a self-referencing link — skip.
                continue
            self.db.execute(text("""
                INSERT INTO control_parent_links
                    (control_uuid, parent_control_uuid, link_type, confidence,
                     source_regulation, source_article, obligation_candidate_id)
                VALUES (CAST(:cu AS uuid), CAST(:pu AS uuid), :lt, :conf,
                        :sr, :sa, CAST(:oci AS uuid))
                ON CONFLICT (control_uuid, parent_control_uuid) DO NOTHING
            """), {
                "cu": master_uuid,
                "pu": parent_uuid,
                "lt": r[1],
                # NULL/0 confidence defaults to 1.0 on transfer.
                "conf": float(r[2]) if r[2] else 1.0,
                "sr": r[3],
                "sa": r[4],
                "oci": r[5],
            })
            transferred += 1

        return transferred
|
||||
|
||||
    def _write_review(self, candidate: dict, matched_payload: dict, score: float):
        """Write a dedup review entry for borderline matches.

        Persists a control_dedup_reviews row (stage 'batch_dedup') pairing the
        candidate with the Qdrant-matched control. Commits immediately; rolls
        back and re-raises on failure.
        """
        try:
            self.db.execute(text("""
                INSERT INTO control_dedup_reviews
                    (candidate_control_id, candidate_title, candidate_objective,
                     matched_control_uuid, matched_control_id,
                     similarity_score, dedup_stage, dedup_details)
                VALUES (:ccid, :ct, :co, CAST(:mcu AS uuid), :mci,
                        :ss, 'batch_dedup', CAST(:dd AS jsonb))
            """), {
                "ccid": candidate["control_id"],
                "ct": candidate["title"],
                "co": candidate.get("objective", ""),
                "mcu": matched_payload.get("control_uuid"),
                "mci": matched_payload.get("control_id"),
                "ss": score,
                # Extra context for the reviewer, stored as jsonb.
                "dd": json.dumps({
                    "merge_group_hint": candidate.get("merge_group_hint", ""),
                    "pattern_id": candidate.get("pattern_id"),
                }),
            })
            self.db.commit()
        except Exception as e:
            logger.error("BatchDedup _write_review error: %s", e)
            self.db.rollback()
            raise
|
||||
|
||||
# ── Progress ─────────────────────────────────────────────────────────
|
||||
|
||||
def _log_progress(self, hint: str):
|
||||
"""Log progress every 500 controls."""
|
||||
if self._progress_count > 0 and self._progress_count % 500 == 0:
|
||||
logger.info(
|
||||
"BatchDedup [%s] %d/%d — masters=%d, linked=%d, review=%d",
|
||||
self._progress_phase, self._progress_count, self._progress_total,
|
||||
self.stats["masters"], self.stats["linked"], self.stats["review"],
|
||||
)
|
||||
|
||||
def get_status(self) -> dict:
|
||||
"""Return current progress stats (for status endpoint)."""
|
||||
return {
|
||||
"phase": self._progress_phase,
|
||||
"progress": self._progress_count,
|
||||
"total": self._progress_total,
|
||||
**self.stats,
|
||||
}
|
||||
438
backend-compliance/compliance/services/citation_backfill.py
Normal file
438
backend-compliance/compliance/services/citation_backfill.py
Normal file
@@ -0,0 +1,438 @@
|
||||
"""
|
||||
Citation Backfill Service — enrich existing controls with article/paragraph provenance.
|
||||
|
||||
3-tier matching strategy:
|
||||
Tier 1 — Hash match: sha256(source_original_text) → RAG chunk lookup
|
||||
Tier 2 — Regex parse: split concatenated "DSGVO Art. 35" → regulation + article
|
||||
Tier 3 — Ollama LLM: ask local LLM to identify article/paragraph from text
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime, timezone
|
||||
from typing import Optional
|
||||
|
||||
import httpx
|
||||
from sqlalchemy import text
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from .rag_client import ComplianceRAGClient, RAGSearchResult
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Ollama connection settings — all overridable via environment variables.
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
OLLAMA_MODEL = os.getenv("CONTROL_GEN_OLLAMA_MODEL", "qwen3.5:35b-a3b")
LLM_TIMEOUT = float(os.getenv("CONTROL_GEN_LLM_TIMEOUT", "180"))

# Qdrant collections scanned when a regulation cannot be mapped to a
# specific collection (see CitationBackfill._map_regulations_to_collections).
ALL_COLLECTIONS = [
    "bp_compliance_ce",
    "bp_compliance_gesetze",
    "bp_compliance_datenschutz",
    "bp_dsfa_corpus",
    "bp_legal_templates",
]

# German system prompt: "You are a legal expert; determine the exact article
# and paragraph from a law text; answer ONLY with valid JSON."
BACKFILL_SYSTEM_PROMPT = (
    "Du bist ein Rechtsexperte. Deine Aufgabe ist es, aus einem Gesetzestext "
    "den genauen Artikel und Absatz zu bestimmen. Antworte NUR mit validem JSON."
)

# Regex to split concatenated source like "DSGVO Art. 35" or "NIS2 Artikel 21 Abs. 2"
# Group 1 = regulation name, group 2 = everything from "Art."/"Artikel" onward.
_SOURCE_ARTICLE_RE = re.compile(
    r"^(.+?)\s+(Art(?:ikel)?\.?\s*\d+.*)$", re.IGNORECASE
)
|
||||
|
||||
|
||||
@dataclass
class MatchResult:
    """Result of one successful citation match (article/paragraph + tier)."""

    article: str    # e.g. "Art. 35" or "§ 42"; "" when not identified
    paragraph: str  # e.g. "Abs. 2"; "" when not identified
    method: str  # "hash", "regex", "llm"
|
||||
|
||||
|
||||
@dataclass
class BackfillResult:
    """Aggregate counters for one backfill run (matched_* split by tier)."""

    total_controls: int = 0
    matched_hash: int = 0
    matched_regex: int = 0
    matched_llm: int = 0
    unmatched: int = 0
    updated: int = 0   # only incremented when dry_run is False
    errors: list = field(default_factory=list)  # human-readable error strings
|
||||
|
||||
|
||||
class CitationBackfill:
    """Backfill article/paragraph into existing control source_citations.

    Matching is tiered: sha256 hash lookup against a RAG chunk index,
    then regex parsing of concatenated citation strings, then an Ollama
    LLM call as the last resort.
    """

    def __init__(self, db: Session, rag_client: ComplianceRAGClient):
        self.db = db
        self.rag = rag_client
        # sha256(chunk text) -> RAG chunk; built lazily per run.
        self._rag_index: dict[str, RAGSearchResult] = {}

    async def run(self, dry_run: bool = True, limit: int = 0) -> BackfillResult:
        """Main entry: iterate controls missing article/paragraph, match to RAG, update."""
        result = BackfillResult()

        # Load controls needing backfill
        controls = self._load_controls_needing_backfill(limit)
        result.total_controls = len(controls)
        logger.info("Backfill: %d controls need article/paragraph enrichment", len(controls))

        if not controls:
            return result

        # Collect hashes we need to find — only build index for controls with source text.
        # NOTE: needed_hashes is used here only to decide whether to build the
        # index and for logging; the index itself is not filtered by it.
        needed_hashes: set[str] = set()
        for ctrl in controls:
            src = ctrl.get("source_original_text")
            if src:
                needed_hashes.add(hashlib.sha256(src.encode()).hexdigest())

        if needed_hashes:
            # Build targeted RAG index — only scroll collections that our controls reference
            logger.info("Building targeted RAG hash index for %d source texts...", len(needed_hashes))
            await self._build_rag_index_targeted(controls)
            logger.info("RAG index built: %d chunks indexed, %d hashes needed", len(self._rag_index), len(needed_hashes))
        else:
            logger.info("No source_original_text found — skipping RAG index build")

        # Process each control
        for i, ctrl in enumerate(controls):
            if i > 0 and i % 100 == 0:
                logger.info("Backfill progress: %d/%d processed", i, result.total_controls)

            try:
                match = await self._match_control(ctrl)
                if match:
                    if match.method == "hash":
                        result.matched_hash += 1
                    elif match.method == "regex":
                        result.matched_regex += 1
                    elif match.method == "llm":
                        result.matched_llm += 1

                    if not dry_run:
                        self._update_control(ctrl, match)
                        result.updated += 1
                    else:
                        logger.debug(
                            "DRY RUN: Would update %s with article=%s paragraph=%s (method=%s)",
                            ctrl["control_id"], match.article, match.paragraph, match.method,
                        )
                else:
                    result.unmatched += 1

            except Exception as e:
                # Per-control failures are collected, not fatal.
                error_msg = f"Error backfilling {ctrl.get('control_id', '?')}: {e}"
                logger.error(error_msg)
                result.errors.append(error_msg)

        if not dry_run:
            # Single commit for all _update_control statements of this run.
            try:
                self.db.commit()
            except Exception as e:
                logger.error("Backfill commit failed: %s", e)
                result.errors.append(f"Commit failed: {e}")

        logger.info(
            "Backfill complete: %d total, hash=%d regex=%d llm=%d unmatched=%d updated=%d",
            result.total_controls, result.matched_hash, result.matched_regex,
            result.matched_llm, result.unmatched, result.updated,
        )
        return result

    def _load_controls_needing_backfill(self, limit: int = 0) -> list[dict]:
        """Load controls where source_citation exists but lacks separate 'article' key.

        Only license rules 1 and 2 are eligible (those retain original text).
        *limit* is interpolated into the SQL, which is safe here because the
        parameter is typed int.
        """
        query = """
            SELECT id, control_id, source_citation, source_original_text,
                   generation_metadata, license_rule
            FROM canonical_controls
            WHERE license_rule IN (1, 2)
              AND source_citation IS NOT NULL
              AND (
                    source_citation->>'article' IS NULL
                    OR source_citation->>'article' = ''
              )
            ORDER BY control_id
        """
        if limit > 0:
            query += f" LIMIT {limit}"

        result = self.db.execute(text(query))
        cols = result.keys()
        controls = []
        for row in result:
            ctrl = dict(zip(cols, row))
            ctrl["id"] = str(ctrl["id"])
            # Parse JSON fields — drivers may return them as strings.
            for jf in ("source_citation", "generation_metadata"):
                if isinstance(ctrl.get(jf), str):
                    try:
                        ctrl[jf] = json.loads(ctrl[jf])
                    except (json.JSONDecodeError, TypeError):
                        ctrl[jf] = {}
            controls.append(ctrl)
        return controls

    async def _build_rag_index_targeted(self, controls: list[dict]):
        """Build RAG index by scrolling only collections relevant to our controls.

        Uses regulation codes from generation_metadata to identify which collections
        to search, falling back to all collections only if needed.
        """
        # Determine which collections are relevant based on regulation codes
        regulation_to_collection = self._map_regulations_to_collections(controls)
        collections_to_search = set(regulation_to_collection.values()) or set(ALL_COLLECTIONS)

        logger.info("Targeted index: searching %d collections: %s",
                    len(collections_to_search), ", ".join(collections_to_search))

        for collection in collections_to_search:
            offset = None
            page = 0
            # Guard against a scroll API that repeats offsets (infinite loop).
            seen_offsets: set[str] = set()
            while True:
                chunks, next_offset = await self.rag.scroll(
                    collection=collection, offset=offset, limit=200,
                )
                if not chunks:
                    break
                for chunk in chunks:
                    # Very short chunks are skipped — they hash-collide with
                    # boilerplate and never carry a full citation.
                    if chunk.text and len(chunk.text.strip()) >= 50:
                        h = hashlib.sha256(chunk.text.encode()).hexdigest()
                        self._rag_index[h] = chunk
                page += 1
                if page % 50 == 0:
                    logger.info("Indexing %s: page %d (%d chunks so far)",
                                collection, page, len(self._rag_index))
                if not next_offset:
                    break
                if next_offset in seen_offsets:
                    logger.warning("Scroll loop in %s at page %d — stopping", collection, page)
                    break
                seen_offsets.add(next_offset)
                offset = next_offset

            logger.info("Indexed collection %s: %d pages", collection, page)

    def _map_regulations_to_collections(self, controls: list[dict]) -> dict[str, str]:
        """Map regulation codes from controls to likely Qdrant collections."""
        # Heuristic: regulation code prefix → collection
        # NOTE(review): "bp_compliance_recht" is not listed in ALL_COLLECTIONS
        # above — confirm the collection exists, otherwise at_/fr_/es_ codes
        # will target a collection the scroll cannot serve.
        collection_map = {
            "eu_": "bp_compliance_gesetze",
            "dsgvo": "bp_compliance_datenschutz",
            "bdsg": "bp_compliance_gesetze",
            "ttdsg": "bp_compliance_gesetze",
            "nist_": "bp_compliance_ce",
            "owasp": "bp_compliance_ce",
            "bsi_": "bp_compliance_ce",
            "enisa": "bp_compliance_ce",
            "at_": "bp_compliance_recht",
            "fr_": "bp_compliance_recht",
            "es_": "bp_compliance_recht",
        }
        result: dict[str, str] = {}
        for ctrl in controls:
            meta = ctrl.get("generation_metadata") or {}
            reg = meta.get("source_regulation", "")
            if not reg:
                continue
            for prefix, coll in collection_map.items():
                if reg.startswith(prefix):
                    result[reg] = coll
                    break
            else:
                # Unknown regulation — search all
                for coll in ALL_COLLECTIONS:
                    result[f"_all_{coll}"] = coll
        return result

    async def _match_control(self, ctrl: dict) -> Optional[MatchResult]:
        """3-tier matching: hash → regex → LLM."""

        # Tier 1: Hash match against RAG index
        source_text = ctrl.get("source_original_text")
        if source_text:
            h = hashlib.sha256(source_text.encode()).hexdigest()
            chunk = self._rag_index.get(h)
            if chunk and (chunk.article or chunk.paragraph):
                return MatchResult(
                    article=chunk.article or "",
                    paragraph=chunk.paragraph or "",
                    method="hash",
                )

        # Tier 2: Regex parse concatenated source
        citation = ctrl.get("source_citation") or {}
        source_str = citation.get("source", "")
        parsed = _parse_concatenated_source(source_str)
        if parsed and parsed["article"]:
            return MatchResult(
                article=parsed["article"],
                paragraph="",  # Regex can't extract paragraph from concatenated format
                method="regex",
            )

        # Tier 3: Ollama LLM — only attempted when original text is available
        if source_text:
            return await self._llm_match(ctrl)

        return None

    async def _llm_match(self, ctrl: dict) -> Optional[MatchResult]:
        """Use Ollama to identify article/paragraph from source text.

        Returns None when the LLM yields nothing usable; LLM errors are
        logged as warnings, never raised.
        """
        citation = ctrl.get("source_citation") or {}
        regulation_name = citation.get("source", "")
        metadata = ctrl.get("generation_metadata") or {}
        regulation_code = metadata.get("source_regulation", "")
        source_text = ctrl.get("source_original_text", "")

        # German prompt: analyse the law text and return article/paragraph
        # as JSON. Source text is truncated to 2000 chars.
        prompt = f"""Analysiere den folgenden Gesetzestext und bestimme den genauen Artikel und Absatz.

Gesetz: {regulation_name} (Code: {regulation_code})

Text:
---
{source_text[:2000]}
---

Antworte NUR mit JSON:
{{"article": "Art. XX", "paragraph": "Abs. Y"}}

Falls kein spezifischer Absatz erkennbar ist, setze paragraph auf "".
Falls kein Artikel erkennbar ist, setze article auf "".
Bei deutschen Gesetzen mit § verwende: "§ XX" statt "Art. XX"."""

        try:
            raw = await _llm_ollama(prompt, BACKFILL_SYSTEM_PROMPT)
            data = _parse_json(raw)
            if data and (data.get("article") or data.get("paragraph")):
                return MatchResult(
                    article=data.get("article", ""),
                    paragraph=data.get("paragraph", ""),
                    method="llm",
                )
        except Exception as e:
            logger.warning("LLM match failed for %s: %s", ctrl.get("control_id"), e)

        return None

    def _update_control(self, ctrl: dict, match: MatchResult):
        """Update source_citation and generation_metadata in DB.

        Executes the UPDATE but does not commit — run() commits once at the end.
        """
        citation = ctrl.get("source_citation") or {}

        # Clean the source name: remove concatenated article if present
        source_str = citation.get("source", "")
        parsed = _parse_concatenated_source(source_str)
        if parsed:
            citation["source"] = parsed["name"]

        # Add separate article/paragraph fields
        citation["article"] = match.article
        citation["paragraph"] = match.paragraph

        # Update generation_metadata — provenance recorded only when an
        # article was actually identified.
        metadata = ctrl.get("generation_metadata") or {}
        if match.article:
            metadata["source_article"] = match.article
            metadata["source_paragraph"] = match.paragraph
            metadata["backfill_method"] = match.method
            metadata["backfill_at"] = datetime.now(timezone.utc).isoformat()

        self.db.execute(
            text("""
                UPDATE canonical_controls
                SET source_citation = :citation,
                    generation_metadata = :metadata,
                    updated_at = NOW()
                WHERE id = CAST(:id AS uuid)
            """),
            {
                "id": ctrl["id"],
                "citation": json.dumps(citation),
                "metadata": json.dumps(metadata),
            },
        )
|
||||
|
||||
|
||||
def _parse_concatenated_source(source: str) -> Optional[dict]:
    """Split a concatenated citation like 'DSGVO Art. 35' into name + article.

    Handles both the 'Art./Artikel' form and the German '§' form
    ('BDSG § 42' → {'name': 'BDSG', 'article': '§ 42'}). Returns None when
    *source* is empty or matches neither pattern.
    """
    if not source:
        return None

    # Try the pre-compiled Art./Artikel pattern first, then the § form.
    for pattern in (_SOURCE_ARTICLE_RE, re.compile(r"^(.+?)\s+(§\s*\d+.*)$")):
        hit = pattern.match(source)
        if hit is not None:
            return {"name": hit.group(1).strip(), "article": hit.group(2).strip()}

    return None
|
||||
|
||||
|
||||
async def _llm_ollama(prompt: str, system_prompt: Optional[str] = None) -> str:
    """Call Ollama chat API for backfill matching.

    Returns the raw model output string, or "" on any HTTP/transport error
    (errors are logged, never raised).
    """
    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    messages.append({"role": "user", "content": prompt})

    payload = {
        "model": OLLAMA_MODEL,
        "messages": messages,
        "stream": False,
        "format": "json",  # constrain the model to JSON output
        "options": {"num_predict": 256},
        "think": False,
    }

    try:
        async with httpx.AsyncClient(timeout=LLM_TIMEOUT) as client:
            resp = await client.post(f"{OLLAMA_URL}/api/chat", json=payload)
            if resp.status_code != 200:
                logger.error("Ollama backfill failed %d: %s", resp.status_code, resp.text[:300])
                return ""
            data = resp.json()
            msg = data.get("message", {})
            if isinstance(msg, dict):
                return msg.get("content", "")
            # Fallback for response shapes where "message" is not a dict.
            return data.get("response", str(msg))
    except Exception as e:
        logger.error("Ollama backfill request failed: %s", e)
        return ""
|
||||
|
||||
|
||||
def _parse_json(raw: str) -> Optional[dict]:
|
||||
"""Extract JSON object from LLM output."""
|
||||
if not raw:
|
||||
return None
|
||||
# Try direct parse
|
||||
try:
|
||||
return json.loads(raw)
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
# Try extracting from markdown code block
|
||||
m = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", raw, re.DOTALL)
|
||||
if m:
|
||||
try:
|
||||
return json.loads(m.group(1))
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
# Try finding first { ... }
|
||||
m = re.search(r"\{[^{}]*\}", raw)
|
||||
if m:
|
||||
try:
|
||||
return json.loads(m.group(0))
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
return None
|
||||
546
backend-compliance/compliance/services/control_composer.py
Normal file
546
backend-compliance/compliance/services/control_composer.py
Normal file
@@ -0,0 +1,546 @@
|
||||
"""Control Composer — Pattern + Obligation → Master Control.
|
||||
|
||||
Takes an obligation (from ObligationExtractor) and a matched control pattern
|
||||
(from PatternMatcher), then uses LLM to compose a structured, actionable
|
||||
Master Control. Replaces the old Stage 3 (STRUCTURE/REFORM) with a
|
||||
pattern-guided approach.
|
||||
|
||||
Three composition modes based on license rules:
|
||||
Rule 1: Obligation + Pattern + original text → full control
|
||||
Rule 2: Obligation + Pattern + original text + citation → control
|
||||
Rule 3: Obligation + Pattern (NO original text) → reformulated control
|
||||
|
||||
Fallback: No pattern match → basic generation (tagged needs_pattern_assignment)
|
||||
|
||||
Part of the Multi-Layer Control Architecture (Phase 6 of 8).
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Optional
|
||||
|
||||
from compliance.services.obligation_extractor import (
|
||||
ObligationMatch,
|
||||
_llm_ollama,
|
||||
_parse_json,
|
||||
)
|
||||
from compliance.services.pattern_matcher import (
|
||||
ControlPattern,
|
||||
PatternMatchResult,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)

# Ollama model used for control composition; overridable via env var.
OLLAMA_MODEL = os.getenv("CONTROL_GEN_OLLAMA_MODEL", "qwen3.5:35b-a3b")

# Valid values for generated control fields (enforced by _validate_control).
VALID_SEVERITIES = {"low", "medium", "high", "critical"}
VALID_EFFORTS = {"s", "m", "l", "xl"}
VALID_VERIFICATION = {"code_review", "document", "tool", "hybrid"}
|
||||
|
||||
|
||||
@dataclass
class ComposedControl:
    """A Master Control composed from an obligation + pattern."""

    # -- Core fields (mirror the canonical_controls schema) --
    control_id: str = ""
    title: str = ""
    objective: str = ""
    rationale: str = ""
    scope: dict = field(default_factory=dict)
    requirements: list = field(default_factory=list)
    test_procedure: list = field(default_factory=list)
    evidence: list = field(default_factory=list)
    severity: str = "medium"
    risk_score: float = 5.0
    implementation_effort: str = "m"
    open_anchors: list = field(default_factory=list)
    release_state: str = "draft"
    tags: list = field(default_factory=list)
    # -- 3-Rule license fields --
    license_rule: Optional[int] = None
    source_original_text: Optional[str] = None
    source_citation: Optional[dict] = None
    customer_visible: bool = True
    # -- Classification --
    verification_method: Optional[str] = None
    category: Optional[str] = None
    target_audience: Optional[list] = None
    # -- Pattern + obligation linkage --
    pattern_id: Optional[str] = None
    obligation_ids: list = field(default_factory=list)
    # -- Metadata --
    generation_metadata: dict = field(default_factory=dict)
    composition_method: str = "pattern_guided"  # pattern_guided | fallback

    def to_dict(self) -> dict:
        """Serialize for DB storage or API response."""
        # Attribute names double as the output keys; order is significant
        # for stable API responses.
        exported = (
            "control_id", "title", "objective", "rationale", "scope",
            "requirements", "test_procedure", "evidence", "severity",
            "risk_score", "implementation_effort", "open_anchors",
            "release_state", "tags", "license_rule", "source_original_text",
            "source_citation", "customer_visible", "verification_method",
            "category", "target_audience", "pattern_id", "obligation_ids",
            "generation_metadata", "composition_method",
        )
        return {name: getattr(self, name) for name in exported}
|
||||
|
||||
|
||||
class ControlComposer:
    """Composes Master Controls from obligations + patterns.

    Usage::

        composer = ControlComposer()

        control = await composer.compose(
            obligation=obligation_match,
            pattern_result=pattern_match_result,
            chunk_text="...",
            license_rule=1,
            source_citation={...},
        )
    """

    async def compose(
        self,
        obligation: ObligationMatch,
        pattern_result: PatternMatchResult,
        chunk_text: Optional[str] = None,
        license_rule: int = 3,
        source_citation: Optional[dict] = None,
        regulation_code: Optional[str] = None,
    ) -> ComposedControl:
        """Compose a Master Control from obligation + pattern.

        Args:
            obligation: The extracted obligation (from ObligationExtractor).
            pattern_result: The matched pattern (from PatternMatcher).
            chunk_text: Original RAG chunk text (only used for Rules 1-2).
            license_rule: 1=free, 2=citation, 3=restricted.
            source_citation: Citation metadata for Rule 2.
            regulation_code: Source regulation code.

        Returns:
            ComposedControl ready for storage.
        """
        pattern = pattern_result.pattern if pattern_result else None

        # Pattern-guided composition when a pattern matched, otherwise the
        # old-style fallback generation.
        if pattern:
            control = await self._compose_with_pattern(
                obligation, pattern, chunk_text, license_rule, source_citation,
            )
        else:
            control = await self._compose_fallback(
                obligation, chunk_text, license_rule, source_citation,
            )

        # Link the control back to its pattern and obligation.
        control.pattern_id = pattern.id if pattern else None
        if obligation.obligation_id:
            control.obligation_ids = [obligation.obligation_id]

        # Apply the 3-rule license model: Rules 1-2 may carry original text,
        # Rule 3 is restricted (internal only, no source material attached).
        control.license_rule = license_rule
        if license_rule in (1, 2) and chunk_text:
            control.source_original_text = chunk_text
        if license_rule == 2 and source_citation:
            control.source_citation = source_citation
        if license_rule == 3:
            control.customer_visible = False
            control.source_original_text = None
            control.source_citation = None

        # Provenance metadata for auditing the generation pipeline.
        control.generation_metadata = {
            "composition_method": control.composition_method,
            "pattern_id": control.pattern_id,
            "pattern_confidence": round(pattern_result.confidence, 3) if pattern_result else 0,
            "pattern_method": pattern_result.method if pattern_result else "none",
            "obligation_id": obligation.obligation_id,
            "obligation_method": obligation.method,
            "obligation_confidence": round(obligation.confidence, 3),
            "license_rule": license_rule,
            "regulation_code": regulation_code,
        }

        # Clamp/repair invalid field values in place.
        _validate_control(control)

        return control

    async def compose_batch(
        self,
        items: list[dict],
    ) -> list[ComposedControl]:
        """Compose multiple controls sequentially.

        Args:
            items: List of dicts with keys: obligation, pattern_result,
                chunk_text, license_rule, source_citation, regulation_code.

        Returns:
            List of ComposedControl instances (one per input item).
        """
        results = []
        for item in items:
            control = await self.compose(
                obligation=item["obligation"],
                pattern_result=item.get("pattern_result", PatternMatchResult()),
                chunk_text=item.get("chunk_text"),
                license_rule=item.get("license_rule", 3),
                source_citation=item.get("source_citation"),
                regulation_code=item.get("regulation_code"),
            )
            results.append(control)
        return results

    # -----------------------------------------------------------------------
    # Pattern-guided composition
    # -----------------------------------------------------------------------

    async def _compose_with_pattern(
        self,
        obligation: ObligationMatch,
        pattern: ControlPattern,
        chunk_text: Optional[str],
        license_rule: int,
        source_citation: Optional[dict],
    ) -> ComposedControl:
        """Use LLM to fill the pattern template with obligation-specific details.

        Falls back to a pure template fill when the LLM call fails or its
        output cannot be parsed as JSON.
        """
        prompt = _build_compose_prompt(obligation, pattern, chunk_text, license_rule)
        system_prompt = _compose_system_prompt(license_rule)

        llm_result = await _llm_ollama(prompt, system_prompt)
        if not llm_result:
            return self._compose_from_template(obligation, pattern)

        parsed = _parse_json(llm_result)
        if not parsed:
            return self._compose_from_template(obligation, pattern)

        # Every field falls back to the pattern template when the LLM
        # omitted it.
        control = ComposedControl(
            title=parsed.get("title", pattern.name_de)[:255],
            objective=parsed.get("objective", pattern.objective_template),
            rationale=parsed.get("rationale", pattern.rationale_template),
            requirements=_ensure_list(parsed.get("requirements", pattern.requirements_template)),
            test_procedure=_ensure_list(parsed.get("test_procedure", pattern.test_procedure_template)),
            evidence=_ensure_list(parsed.get("evidence", pattern.evidence_template)),
            severity=parsed.get("severity", pattern.severity_default),
            implementation_effort=parsed.get("implementation_effort", pattern.implementation_effort_default),
            category=parsed.get("category", pattern.category),
            tags=_ensure_list(parsed.get("tags", pattern.tags)),
            target_audience=_ensure_list(parsed.get("target_audience", [])),
            verification_method=parsed.get("verification_method"),
            open_anchors=_anchors_from_pattern(pattern),
            composition_method="pattern_guided",
        )

        return control

    def _compose_from_template(
        self,
        obligation: ObligationMatch,
        pattern: ControlPattern,
    ) -> ComposedControl:
        """Fallback: fill template directly without LLM (when LLM fails)."""
        obl_title = obligation.obligation_title or ""
        obl_text = obligation.obligation_text or ""

        title = f"{pattern.name_de}"
        if obl_title:
            title = f"{pattern.name_de} — {obl_title}"

        objective = pattern.objective_template
        # Only append the obligation reference when it carries real content.
        if obl_text and len(obl_text) > 20:
            objective = f"{pattern.objective_template} Bezug: {obl_text[:200]}"

        # Copy mutable template lists so later mutation cannot corrupt the pattern.
        return ComposedControl(
            title=title[:255],
            objective=objective,
            rationale=pattern.rationale_template,
            requirements=list(pattern.requirements_template),
            test_procedure=list(pattern.test_procedure_template),
            evidence=list(pattern.evidence_template),
            severity=pattern.severity_default,
            implementation_effort=pattern.implementation_effort_default,
            category=pattern.category,
            tags=list(pattern.tags),
            open_anchors=_anchors_from_pattern(pattern),
            composition_method="template_only",
        )

    # -----------------------------------------------------------------------
    # Fallback (no pattern)
    # -----------------------------------------------------------------------

    async def _compose_fallback(
        self,
        obligation: ObligationMatch,
        chunk_text: Optional[str],
        license_rule: int,
        source_citation: Optional[dict],
    ) -> ComposedControl:
        """Generate a control without a pattern template (old-style)."""
        prompt = _build_fallback_prompt(obligation, chunk_text, license_rule)
        system_prompt = _compose_system_prompt(license_rule)

        llm_result = await _llm_ollama(prompt, system_prompt)
        # BUG FIX: _parse_json may return None on malformed LLM output; coerce
        # to {} so the .get() calls below never raise AttributeError.
        parsed = (_parse_json(llm_result) or {}) if llm_result else {}

        obl_text = obligation.obligation_text or ""

        control = ComposedControl(
            title=parsed.get("title", obl_text[:100] if obl_text else "Untitled Control")[:255],
            objective=parsed.get("objective", obl_text[:500]),
            rationale=parsed.get("rationale", "Aus gesetzlicher Pflicht abgeleitet."),
            requirements=_ensure_list(parsed.get("requirements", [])),
            test_procedure=_ensure_list(parsed.get("test_procedure", [])),
            evidence=_ensure_list(parsed.get("evidence", [])),
            severity=parsed.get("severity", "medium"),
            implementation_effort=parsed.get("implementation_effort", "m"),
            category=parsed.get("category"),
            tags=_ensure_list(parsed.get("tags", [])),
            target_audience=_ensure_list(parsed.get("target_audience", [])),
            verification_method=parsed.get("verification_method"),
            composition_method="fallback",
            # Untemplated controls always need a human look before release.
            release_state="needs_review",
        )

        return control
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Prompt builders
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _compose_system_prompt(license_rule: int) -> str:
|
||||
"""Build the system prompt based on license rule."""
|
||||
if license_rule == 3:
|
||||
return (
|
||||
"Du bist ein Security-Compliance-Experte. Deine Aufgabe ist es, "
|
||||
"eigenstaendige Security Controls zu formulieren. "
|
||||
"Du formulierst IMMER in eigenen Worten. "
|
||||
"KOPIERE KEINE Saetze aus dem Quelltext. "
|
||||
"Verwende eigene Begriffe und Struktur. "
|
||||
"NENNE NICHT die Quelle. Keine proprietaeren Bezeichner. "
|
||||
"Antworte NUR mit validem JSON."
|
||||
)
|
||||
return (
|
||||
"Du bist ein Security-Compliance-Experte. "
|
||||
"Erstelle ein praxisorientiertes, umsetzbares Security Control. "
|
||||
"Antworte NUR mit validem JSON."
|
||||
)
|
||||
|
||||
|
||||
def _build_compose_prompt(
    obligation: ObligationMatch,
    pattern: ControlPattern,
    chunk_text: Optional[str],
    license_rule: int,
) -> str:
    """Build the LLM prompt for pattern-guided composition.

    Combines three sections — the extracted obligation, the matched
    pattern template, and (license permitting) the original source text —
    then asks the LLM for a JSON-shaped control.
    """
    obl_section = _obligation_section(obligation)
    pattern_section = _pattern_section(pattern)

    # Rule 3 forbids exposing original text; otherwise include up to 2000 chars.
    if license_rule == 3:
        context_section = "KONTEXT: Intern analysiert (keine Quellenangabe)."
    elif chunk_text:
        context_section = f"KONTEXT (Originaltext):\n{chunk_text[:2000]}"
    else:
        context_section = "KONTEXT: Kein Originaltext verfuegbar."

    # Double braces {{ }} render as literal braces in the f-string so the
    # expected JSON shape can be spelled out for the model.
    return f"""Erstelle ein PRAXISORIENTIERTES Security Control.

{obl_section}

{pattern_section}

{context_section}

AUFGABE:
Fuelle das Muster mit pflicht-spezifischen Details.
Das Ergebnis muss UMSETZBAR sein — keine Gesetzesparaphrase.
Formuliere konkret und handlungsorientiert.

Antworte als JSON:
{{
  "title": "Kurzer praegnanter Titel (max 100 Zeichen, deutsch)",
  "objective": "Was soll erreicht werden? (1-3 Saetze)",
  "rationale": "Warum ist das wichtig? (1-2 Saetze)",
  "requirements": ["Konkrete Anforderung 1", "Anforderung 2", ...],
  "test_procedure": ["Pruefschritt 1", "Pruefschritt 2", ...],
  "evidence": ["Nachweis 1", "Nachweis 2", ...],
  "severity": "low|medium|high|critical",
  "implementation_effort": "s|m|l|xl",
  "category": "{pattern.category}",
  "tags": ["tag1", "tag2"],
  "target_audience": ["unternehmen", "behoerden", "entwickler"],
  "verification_method": "code_review|document|tool|hybrid"
}}"""
|
||||
|
||||
|
||||
def _build_fallback_prompt(
    obligation: ObligationMatch,
    chunk_text: Optional[str],
    license_rule: int,
) -> str:
    """Build the LLM prompt for fallback composition (no pattern).

    Same structure as the pattern-guided prompt, minus the pattern
    section; the category field is left for the model to pick.
    """
    obl_section = _obligation_section(obligation)

    # Rule 3 forbids exposing original text; otherwise include up to 2000 chars.
    if license_rule == 3:
        context_section = "KONTEXT: Intern analysiert (keine Quellenangabe)."
    elif chunk_text:
        context_section = f"KONTEXT (Originaltext):\n{chunk_text[:2000]}"
    else:
        context_section = "KONTEXT: Kein Originaltext verfuegbar."

    # Double braces {{ }} render as literal braces in the f-string.
    return f"""Erstelle ein Security Control aus der folgenden Pflicht.

{obl_section}

{context_section}

AUFGABE:
Formuliere ein umsetzbares Security Control.
Keine Gesetzesparaphrase — konkrete Massnahmen beschreiben.

Antworte als JSON:
{{
  "title": "Kurzer praegnanter Titel (max 100 Zeichen, deutsch)",
  "objective": "Was soll erreicht werden? (1-3 Saetze)",
  "rationale": "Warum ist das wichtig? (1-2 Saetze)",
  "requirements": ["Konkrete Anforderung 1", "Anforderung 2", ...],
  "test_procedure": ["Pruefschritt 1", "Pruefschritt 2", ...],
  "evidence": ["Nachweis 1", "Nachweis 2", ...],
  "severity": "low|medium|high|critical",
  "implementation_effort": "s|m|l|xl",
  "category": "one of: authentication, encryption, data_protection, etc.",
  "tags": ["tag1", "tag2"],
  "target_audience": ["unternehmen"],
  "verification_method": "code_review|document|tool|hybrid"
}}"""
|
||||
|
||||
|
||||
def _obligation_section(obligation: ObligationMatch) -> str:
    """Render the obligation as a labelled prompt section."""
    title = obligation.obligation_title
    text = obligation.obligation_text

    lines = ["PFLICHT (was das Gesetz verlangt):"]
    if title:
        lines.append(f"  Titel: {title}")
    if text:
        # Cap the description so the prompt stays compact.
        lines.append(f"  Beschreibung: {text[:500]}")
    if obligation.obligation_id:
        lines.append(f"  ID: {obligation.obligation_id}")
    if obligation.regulation_id:
        lines.append(f"  Rechtsgrundlage: {obligation.regulation_id}")
    if not (text or title):
        lines.append("  (Keine spezifische Pflicht extrahiert)")
    return "\n".join(lines)
|
||||
|
||||
|
||||
def _pattern_section(pattern: ControlPattern) -> str:
    """Render the pattern template as a labelled prompt section.

    Only the first few template entries are included to keep the
    prompt compact (5 requirements, 3 test steps).
    """
    bullet_reqs = (f"- {item}" for item in pattern.requirements_template[:5])
    bullet_tests = (f"- {item}" for item in pattern.test_procedure_template[:3])
    reqs = "\n  ".join(bullet_reqs)
    tests = "\n  ".join(bullet_tests)
    return f"""MUSTER (wie man es typischerweise umsetzt):
  Pattern: {pattern.name_de} ({pattern.id})
  Domain: {pattern.domain}
  Ziel-Template: {pattern.objective_template}
  Anforderungs-Template:
  {reqs}
  Pruefverfahren-Template:
  {tests}"""
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _ensure_list(value) -> list:
|
||||
"""Ensure a value is a list of strings."""
|
||||
if isinstance(value, list):
|
||||
return [str(v) for v in value if v]
|
||||
if isinstance(value, str):
|
||||
return [value]
|
||||
return []
|
||||
|
||||
|
||||
def _anchors_from_pattern(pattern: ControlPattern) -> list:
    """Translate the pattern's open_anchor_refs into control anchor dicts."""
    return [
        {
            "framework": ref.get("framework", ""),
            "control_id": ref.get("ref", ""),
            "title": "",
            # Pattern-derived anchors get a fixed default alignment.
            "alignment_score": 0.8,
        }
        for ref in pattern.open_anchor_refs
    ]
|
||||
|
||||
|
||||
def _validate_control(control: ComposedControl) -> None:
    """Validate and repair control field values in place.

    Clamps enum-like fields to their valid sets, derives a risk score
    from severity when out of range, truncates overlong titles, and
    backfills empty core content with generic German placeholders.
    """
    # Enum-like fields fall back to safe defaults.
    if control.severity not in VALID_SEVERITIES:
        control.severity = "medium"
    if control.implementation_effort not in VALID_EFFORTS:
        control.implementation_effort = "m"
    if control.verification_method and control.verification_method not in VALID_VERIFICATION:
        control.verification_method = None

    # Risk score must lie in [0, 10]; otherwise derive it from severity.
    if not (0 <= control.risk_score <= 10):
        control.risk_score = _severity_to_risk(control.severity)

    # Title length (DB column limit).
    if len(control.title) > 255:
        control.title = control.title[:252] + "..."

    # Guarantee minimum content for every core field.
    fallbacks = (
        ("objective", control.title),
        ("rationale", "Aus regulatorischer Anforderung abgeleitet."),
        ("requirements", ["Anforderung gemaess Pflichtbeschreibung umsetzen"]),
        ("test_procedure", ["Umsetzung der Anforderungen pruefen"]),
        ("evidence", ["Dokumentation der Umsetzung"]),
    )
    for attr, default in fallbacks:
        if not getattr(control, attr):
            setattr(control, attr, default)
|
||||
|
||||
|
||||
def _severity_to_risk(severity: str) -> float:
|
||||
"""Map severity to a default risk score."""
|
||||
return {
|
||||
"critical": 9.0,
|
||||
"high": 7.0,
|
||||
"medium": 5.0,
|
||||
"low": 3.0,
|
||||
}.get(severity, 5.0)
|
||||
745
backend-compliance/compliance/services/control_dedup.py
Normal file
745
backend-compliance/compliance/services/control_dedup.py
Normal file
@@ -0,0 +1,745 @@
|
||||
"""Control Deduplication Engine — 4-Stage Matching Pipeline.
|
||||
|
||||
Prevents duplicate atomic controls during Pass 0b by checking candidates
|
||||
against existing controls before insertion.
|
||||
|
||||
Stages:
|
||||
1. Pattern-Gate: pattern_id must match (hard gate)
|
||||
2. Action-Check: normalized action verb must match (hard gate)
|
||||
3. Object-Norm: normalized object must match (soft gate with high threshold)
|
||||
4. Embedding: cosine similarity with tiered thresholds (Qdrant)
|
||||
|
||||
Verdicts:
|
||||
- NEW: create a new atomic control
|
||||
- LINK: add parent link to existing control (similarity > LINK_THRESHOLD)
|
||||
- REVIEW: queue for human review (REVIEW_THRESHOLD < sim < LINK_THRESHOLD)
|
||||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Optional, Callable, Awaitable
|
||||
|
||||
import httpx
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ── Configuration ────────────────────────────────────────────────────

# Master switch for the dedup pipeline.
DEDUP_ENABLED = os.getenv("DEDUP_ENABLED", "true").lower() == "true"
# Similarity >= LINK_THRESHOLD → auto-link; between REVIEW and LINK → human review.
LINK_THRESHOLD = float(os.getenv("DEDUP_LINK_THRESHOLD", "0.92"))
REVIEW_THRESHOLD = float(os.getenv("DEDUP_REVIEW_THRESHOLD", "0.85"))
# Stricter link threshold when the normalized objects differ.
LINK_THRESHOLD_DIFF_OBJECT = float(os.getenv("DEDUP_LINK_THRESHOLD_DIFF_OBJ", "0.95"))
# Threshold for linking candidates across different regulations.
CROSS_REG_LINK_THRESHOLD = float(os.getenv("DEDUP_CROSS_REG_THRESHOLD", "0.95"))
# Qdrant vector store + embedding service endpoints.
QDRANT_COLLECTION = os.getenv("DEDUP_QDRANT_COLLECTION", "atomic_controls")
QDRANT_URL = os.getenv("QDRANT_URL", "http://host.docker.internal:6333")
EMBEDDING_URL = os.getenv("EMBEDDING_URL", "http://embedding-service:8087")
|
||||
|
||||
|
||||
# ── Result Dataclass ─────────────────────────────────────────────────

@dataclass
class DedupResult:
    """Outcome of the dedup check for one candidate control."""
    verdict: str  # "new" | "link" | "review"
    matched_control_uuid: Optional[str] = None  # existing control matched (link/review only)
    matched_control_id: Optional[str] = None  # human-readable id of the match
    matched_title: Optional[str] = None  # title of the matched control
    stage: str = ""  # which stage decided
    similarity_score: float = 0.0  # embedding similarity (0.0 when decided by an earlier stage)
    link_type: str = "dedup_merge"  # "dedup_merge" | "cross_regulation"
    details: dict = field(default_factory=dict)  # stage-specific diagnostics
|
||||
|
||||
|
||||
# ── Action Normalization ─────────────────────────────────────────────

# Lookup table used by normalize_action() (Stage 2, Action-Check): maps
# German and English action verbs to one canonical English token so that
# differently-phrased controls can be compared on their verb.
_ACTION_SYNONYMS: dict[str, str] = {
    # German → canonical English
    "implementieren": "implement",
    "umsetzen": "implement",
    "einrichten": "implement",
    "einführen": "implement",
    "aufbauen": "implement",
    "bereitstellen": "implement",
    "aktivieren": "implement",
    "konfigurieren": "configure",
    "einstellen": "configure",
    "parametrieren": "configure",
    "testen": "test",
    "prüfen": "test",
    "überprüfen": "test",
    "verifizieren": "test",
    "validieren": "test",
    "kontrollieren": "test",
    "auditieren": "audit",
    "dokumentieren": "document",
    "protokollieren": "log",
    "aufzeichnen": "log",
    "loggen": "log",
    "überwachen": "monitor",
    "monitoring": "monitor",
    "beobachten": "monitor",
    "schulen": "train",
    "trainieren": "train",
    "sensibilisieren": "train",
    "löschen": "delete",
    "entfernen": "delete",
    "verschlüsseln": "encrypt",
    "sperren": "block",
    "beschränken": "restrict",
    "einschränken": "restrict",
    "begrenzen": "restrict",
    "autorisieren": "authorize",
    "genehmigen": "authorize",
    "freigeben": "authorize",
    "authentifizieren": "authenticate",
    "identifizieren": "identify",
    "melden": "report",
    "benachrichtigen": "notify",
    "informieren": "notify",
    "aktualisieren": "update",
    "erneuern": "update",
    "sichern": "backup",
    "wiederherstellen": "restore",
    # English passthrough (identity mappings so English input is also canonical)
    "implement": "implement",
    "configure": "configure",
    "test": "test",
    "verify": "test",
    "validate": "test",
    "audit": "audit",
    "document": "document",
    "log": "log",
    "monitor": "monitor",
    "train": "train",
    "delete": "delete",
    "encrypt": "encrypt",
    "restrict": "restrict",
    "authorize": "authorize",
    "authenticate": "authenticate",
    "report": "report",
    "update": "update",
    "backup": "backup",
    "restore": "restore",
}
|
||||
|
||||
|
||||
def normalize_action(action: str) -> str:
    """Normalize an action verb (German or English) to a canonical English form.

    Lookup order: exact table match, suffix-stripped base form, then a
    fuzzy prefix comparison against the table. Unknown verbs pass through
    unchanged (lowercased/stripped).
    """
    if not action:
        return ""
    verb = action.strip().lower()

    canonical = _ACTION_SYNONYMS.get(verb)
    if canonical is not None:
        return canonical

    # Strip common German infinitive/conjugation endings and retry.
    stem = re.sub(r"(en|t|st|e|te|tet|end)$", "", verb)
    canonical = _ACTION_SYNONYMS.get(stem)
    if canonical is not None:
        return canonical

    # Last resort: prefix overlap in either direction with any known verb.
    for known, mapped in _ACTION_SYNONYMS.items():
        if verb.startswith(known) or known.startswith(verb):
            return mapped
    return verb
|
||||
|
||||
|
||||
# ── Object Normalization ─────────────────────────────────────────────

# Lookup table used by normalize_object() (Stage 3, Object-Norm): maps
# German and English compliance object phrases to one canonical token.
_OBJECT_SYNONYMS: dict[str, str] = {
    # Authentication / Access
    "mfa": "multi_factor_auth",
    "multi-faktor-authentifizierung": "multi_factor_auth",
    "mehrfaktorauthentifizierung": "multi_factor_auth",
    "multi-factor authentication": "multi_factor_auth",
    "two-factor": "multi_factor_auth",
    "2fa": "multi_factor_auth",
    "passwort": "password_policy",
    "kennwort": "password_policy",
    "password": "password_policy",
    "zugangsdaten": "credentials",
    "credentials": "credentials",
    "admin-konten": "privileged_access",
    "admin accounts": "privileged_access",
    "administratorkonten": "privileged_access",
    "privilegierte zugriffe": "privileged_access",
    "privileged accounts": "privileged_access",
    "remote-zugriff": "remote_access",
    "fernzugriff": "remote_access",
    "remote access": "remote_access",
    "session": "session_management",
    "sitzung": "session_management",
    "sitzungsverwaltung": "session_management",
    # Encryption
    "verschlüsselung": "encryption",
    "encryption": "encryption",
    "kryptografie": "encryption",
    "kryptografische verfahren": "encryption",
    "schlüssel": "key_management",
    "key management": "key_management",
    "schlüsselverwaltung": "key_management",
    "zertifikat": "certificate_management",
    "certificate": "certificate_management",
    "tls": "transport_encryption",
    "ssl": "transport_encryption",
    "https": "transport_encryption",
    # Network
    "firewall": "firewall",
    "netzwerk": "network_security",
    "network": "network_security",
    "vpn": "vpn",
    "segmentierung": "network_segmentation",
    "segmentation": "network_segmentation",
    # Logging / Monitoring
    "audit-log": "audit_logging",
    "audit log": "audit_logging",
    "protokoll": "audit_logging",
    "logging": "audit_logging",
    "monitoring": "monitoring",
    "überwachung": "monitoring",
    "alerting": "alerting",
    "alarmierung": "alerting",
    "siem": "siem",
    # Data
    "personenbezogene daten": "personal_data",
    "personal data": "personal_data",
    "sensible daten": "sensitive_data",
    "sensitive data": "sensitive_data",
    "datensicherung": "backup",
    "backup": "backup",
    "wiederherstellung": "disaster_recovery",
    "disaster recovery": "disaster_recovery",
    # Policy / Process
    "richtlinie": "policy",
    "policy": "policy",
    "verfahrensanweisung": "procedure",
    "procedure": "procedure",
    "prozess": "process",
    "schulung": "training",
    "training": "training",
    "awareness": "awareness",
    "sensibilisierung": "awareness",
    # Incident
    "vorfall": "incident",
    "incident": "incident",
    "sicherheitsvorfall": "security_incident",
    "security incident": "security_incident",
    # Vulnerability
    "schwachstelle": "vulnerability",
    "vulnerability": "vulnerability",
    "patch": "patch_management",
    "update": "patch_management",
    "patching": "patch_management",
}

# Precompile for substring matching (longest first, so the most specific
# phrase wins when several phrases occur in one object string)
_OBJECT_KEYS_SORTED = sorted(_OBJECT_SYNONYMS.keys(), key=len, reverse=True)
|
||||
|
||||
|
||||
def normalize_object(obj: str) -> str:
    """Normalize a compliance object phrase to a canonical token.

    Tries an exact table lookup, then a longest-phrase-first substring
    match; otherwise strips German/English articles and prepositions and
    joins the remaining significant words with underscores.
    """
    if not obj:
        return ""
    phrase = obj.strip().lower()

    # Exact table hit.
    canonical = _OBJECT_SYNONYMS.get(phrase)
    if canonical is not None:
        return canonical

    # Substring match, most specific (longest) phrase first.
    for known in _OBJECT_KEYS_SORTED:
        if known in phrase:
            return _OBJECT_SYNONYMS[known]

    # Fallback: drop filler words and squash to a short underscore token.
    stripped = re.sub(
        r"\b(der|die|das|den|dem|des|ein|eine|eines|einem|einen"
        r"|für|von|zu|auf|in|an|bei|mit|nach|über|unter|the|a|an"
        r"|for|of|to|on|in|at|by|with)\b",
        "",
        phrase,
    )
    words = [w for w in stripped.split() if len(w) > 2]
    if words:
        return "_".join(words[:4])
    return phrase.replace(" ", "_")
|
||||
|
||||
|
||||
# ── Canonicalization ─────────────────────────────────────────────────
|
||||
|
||||
def canonicalize_text(action: str, obj: str, title: str = "") -> str:
    """Build a canonical English text for embedding.

    Joins the normalized action and object with up to five keywords
    harvested from the title (common German filler words removed) so
    that embeddings of semantically equal controls compare stably.
    """
    pieces = [normalize_action(action), normalize_object(obj)]
    if title:
        # Strip frequent German connectives/articles before harvesting keywords.
        filtered = re.sub(
            r"\b(und|oder|für|von|zu|der|die|das|den|dem|des|ein|eine"
            r"|bei|mit|nach|gemäß|gem\.|laut|entsprechend)\b",
            "", title.lower()
        )
        keywords = [w for w in filtered.split() if len(w) > 3][:5]
        if keywords:
            pieces.append("for")
            pieces.extend(keywords)
    return " ".join(pieces)
|
||||
|
||||
|
||||
# ── Embedding Helper ─────────────────────────────────────────────────
|
||||
|
||||
async def get_embedding(text: str) -> list[float]:
    """Get embedding vector for a single text via the embedding service.

    Returns an empty list on any failure (network error, non-2xx status,
    malformed response) so callers can degrade gracefully.
    """
    try:
        async with httpx.AsyncClient(timeout=10.0) as client:
            resp = await client.post(
                f"{EMBEDDING_URL}/embed",
                json={"texts": [text]},
            )
            # Fail fast on HTTP errors so the warning below carries the
            # status code instead of an opaque JSON-decode error from an
            # HTML/error body.
            resp.raise_for_status()
            embeddings = resp.json().get("embeddings", [])
            return embeddings[0] if embeddings else []
    except Exception as e:
        logger.warning("Embedding failed: %s", e)
        return []
|
||||
|
||||
|
||||
def cosine_similarity(a: list[float], b: list[float]) -> float:
    """Cosine similarity of two equal-length vectors.

    Returns 0.0 for empty inputs, mismatched lengths, or zero-magnitude
    vectors (degenerate cases where the similarity is undefined).
    """
    if not a or not b or len(a) != len(b):
        return 0.0
    dot_product = 0.0
    sq_a = 0.0
    sq_b = 0.0
    for x, y in zip(a, b):
        dot_product += x * y
        sq_a += x * x
        sq_b += y * y
    if sq_a == 0 or sq_b == 0:
        return 0.0
    return dot_product / ((sq_a ** 0.5) * (sq_b ** 0.5))
|
||||
|
||||
|
||||
# ── Qdrant Helpers ───────────────────────────────────────────────────
|
||||
|
||||
async def qdrant_search(
    embedding: list[float],
    pattern_id: str,
    top_k: int = 10,
    collection: Optional[str] = None,
) -> list[dict]:
    """Search Qdrant for similar atomic controls, filtered by pattern_id.

    Returns the raw Qdrant result list, or [] on empty input, non-200
    response, or any transport error (logged as warnings).
    """
    if not embedding:
        return []
    target = collection or QDRANT_COLLECTION
    # Restrict matches to the same pattern_id so dedup stays intra-pattern.
    search_request: dict = {
        "vector": embedding,
        "limit": top_k,
        "with_payload": True,
        "filter": {
            "must": [{"key": "pattern_id", "match": {"value": pattern_id}}]
        },
    }
    try:
        async with httpx.AsyncClient(timeout=10.0) as client:
            resp = await client.post(
                f"{QDRANT_URL}/collections/{target}/points/search",
                json=search_request,
            )
            if resp.status_code == 200:
                return resp.json().get("result", [])
            logger.warning("Qdrant search failed: %d", resp.status_code)
            return []
    except Exception as e:
        logger.warning("Qdrant search error: %s", e)
        return []
|
||||
|
||||
|
||||
async def qdrant_search_cross_regulation(
    embedding: list[float],
    top_k: int = 5,
    collection: Optional[str] = None,
) -> list[dict]:
    """Search Qdrant for similar controls across ALL regulations.

    Unlike qdrant_search, no pattern_id filter is applied. Used for
    cross-regulation linking (e.g. DSGVO Art. 25 ↔ NIS2 Art. 21).
    Returns [] on empty input, non-200 response, or transport errors.
    """
    if not embedding:
        return []
    target = collection or QDRANT_COLLECTION
    search_request: dict = {
        "vector": embedding,
        "limit": top_k,
        "with_payload": True,
    }
    try:
        async with httpx.AsyncClient(timeout=10.0) as client:
            resp = await client.post(
                f"{QDRANT_URL}/collections/{target}/points/search",
                json=search_request,
            )
            if resp.status_code == 200:
                return resp.json().get("result", [])
            logger.warning("Qdrant cross-reg search failed: %d", resp.status_code)
            return []
    except Exception as e:
        logger.warning("Qdrant cross-reg search error: %s", e)
        return []
|
||||
|
||||
|
||||
async def qdrant_upsert(
    point_id: str,
    embedding: list[float],
    payload: dict,
    collection: Optional[str] = None,
) -> bool:
    """Upsert a single point into a Qdrant collection.

    Returns True iff Qdrant answered 200; empty embeddings and transport
    errors (logged) yield False.
    """
    if not embedding:
        return False
    target = collection or QDRANT_COLLECTION
    request_body = {
        "points": [
            {"id": point_id, "vector": embedding, "payload": payload}
        ]
    }
    try:
        async with httpx.AsyncClient(timeout=10.0) as client:
            resp = await client.put(
                f"{QDRANT_URL}/collections/{target}/points",
                json=request_body,
            )
            return resp.status_code == 200
    except Exception as e:
        logger.warning("Qdrant upsert error: %s", e)
        return False
|
||||
|
||||
|
||||
async def ensure_qdrant_collection(
    vector_size: int = 1024,
    collection: Optional[str] = None,
) -> bool:
    """Create a Qdrant collection if it doesn't exist (idempotent).

    On creation, also registers keyword payload indexes for the fields
    used by dedup filtering. Returns True when the collection exists or
    was created; False on creation failure or transport errors.
    """
    target = collection or QDRANT_COLLECTION
    try:
        async with httpx.AsyncClient(timeout=10.0) as client:
            # Already present → nothing to do.
            exists = await client.get(f"{QDRANT_URL}/collections/{target}")
            if exists.status_code == 200:
                return True
            created = await client.put(
                f"{QDRANT_URL}/collections/{target}",
                json={
                    "vectors": {"size": vector_size, "distance": "Cosine"},
                },
            )
            if created.status_code != 200:
                logger.error("Failed to create Qdrant collection: %d", created.status_code)
                return False
            logger.info("Created Qdrant collection: %s", target)
            # Keyword payload indexes backing the dedup search filters.
            for field_name in ["pattern_id", "action_normalized", "object_normalized", "control_id"]:
                await client.put(
                    f"{QDRANT_URL}/collections/{target}/index",
                    json={"field_name": field_name, "field_schema": "keyword"},
                )
            return True
    except Exception as e:
        logger.warning("Qdrant collection check error: %s", e)
        return False
|
||||
|
||||
|
||||
# ── Main Dedup Checker ───────────────────────────────────────────────
|
||||
|
||||
class ControlDedupChecker:
    """4-stage dedup checker for atomic controls.

    Pipeline (see check_duplicate): pattern gate → action check →
    object normalization → embedding similarity, followed by a
    cross-regulation second pass for candidates still deemed "new".

    Usage:
        checker = ControlDedupChecker(db_session)
        result = await checker.check_duplicate(candidate_action, candidate_object, candidate_title, pattern_id)
        if result.verdict == "link":
            checker.add_parent_link(result.matched_control_uuid, parent_uuid)
        elif result.verdict == "review":
            checker.write_review(candidate, result)
        else:
            # Insert new control
    """

    def __init__(
        self,
        db,
        embed_fn: Optional[Callable[[str], Awaitable[list[float]]]] = None,
        search_fn: Optional[Callable] = None,
    ):
        # embed_fn / search_fn are injectable for testing; they default to
        # the module-level HTTP helpers (get_embedding / qdrant_search).
        self.db = db
        self._embed = embed_fn or get_embedding
        self._search = search_fn or qdrant_search
        self._cache: dict[str, list[dict]] = {}  # pattern_id → existing controls

    def _load_existing(self, pattern_id: str) -> list[dict]:
        """Load existing atomic controls with same pattern_id from DB.

        Results are cached per pattern_id for the lifetime of this checker
        instance, so repeated checks against one pattern hit the DB once.
        """
        if pattern_id in self._cache:
            return self._cache[pattern_id]
        from sqlalchemy import text
        rows = self.db.execute(text("""
            SELECT id::text, control_id, title, objective,
                   pattern_id,
                   generation_metadata->>'obligation_type' as obligation_type
            FROM canonical_controls
            WHERE parent_control_uuid IS NOT NULL
              AND release_state != 'deprecated'
              AND pattern_id = :pid
        """), {"pid": pattern_id}).fetchall()
        result = [
            {
                "uuid": r[0], "control_id": r[1], "title": r[2],
                "objective": r[3], "pattern_id": r[4],
                "obligation_type": r[5],
            }
            for r in rows
        ]
        self._cache[pattern_id] = result
        return result

    async def check_duplicate(
        self,
        action: str,
        obj: str,
        title: str,
        pattern_id: Optional[str],
    ) -> DedupResult:
        """Run the 4-stage dedup pipeline + cross-regulation linking.

        Returns DedupResult with verdict: new/link/review.
        """
        # No pattern_id → can't dedup meaningfully
        if not pattern_id:
            return DedupResult(verdict="new", stage="no_pattern")

        # Stage 1: Pattern-Gate — no existing siblings means trivially new.
        existing = self._load_existing(pattern_id)
        if not existing:
            return DedupResult(
                verdict="new", stage="pattern_gate",
                details={"reason": "no existing controls with this pattern_id"},
            )

        # Stage 2: Action-Check
        norm_action = normalize_action(action)
        # We don't have action stored on existing controls from DB directly,
        # so we use embedding for controls that passed pattern gate.
        # But we CAN check via generation_metadata if available.

        # Stage 3: Object-Normalization
        norm_object = normalize_object(obj)

        # Stage 4: Embedding Similarity
        canonical = canonicalize_text(action, obj, title)
        embedding = await self._embed(canonical)
        if not embedding:
            # Can't compute embedding → default to new
            return DedupResult(
                verdict="new", stage="embedding_unavailable",
                details={"canonical_text": canonical},
            )

        # Search Qdrant within the same pattern_id
        results = await self._search(embedding, pattern_id, top_k=5)

        if not results:
            # No intra-pattern matches → try cross-regulation
            return await self._check_cross_regulation(embedding, DedupResult(
                verdict="new", stage="no_qdrant_matches",
                details={"canonical_text": canonical, "action": norm_action, "object": norm_object},
            ))

        # Evaluate best match
        best = results[0]
        best_score = best.get("score", 0.0)
        best_payload = best.get("payload", {})
        best_action = best_payload.get("action_normalized", "")
        best_object = best_payload.get("object_normalized", "")

        # Action differs → NEW (even if embedding is high)
        if best_action and norm_action and best_action != norm_action:
            return await self._check_cross_regulation(embedding, DedupResult(
                verdict="new", stage="action_mismatch",
                similarity_score=best_score,
                matched_control_id=best_payload.get("control_id"),
                details={
                    "candidate_action": norm_action,
                    "existing_action": best_action,
                    "similarity": best_score,
                },
            ))

        # Object differs → use higher threshold before linking
        if best_object and norm_object and best_object != norm_object:
            if best_score > LINK_THRESHOLD_DIFF_OBJECT:
                return DedupResult(
                    verdict="link", stage="embedding_diff_object",
                    matched_control_uuid=best_payload.get("control_uuid"),
                    matched_control_id=best_payload.get("control_id"),
                    matched_title=best_payload.get("title"),
                    similarity_score=best_score,
                    details={"candidate_object": norm_object, "existing_object": best_object},
                )
            return await self._check_cross_regulation(embedding, DedupResult(
                verdict="new", stage="object_mismatch_below_threshold",
                similarity_score=best_score,
                matched_control_id=best_payload.get("control_id"),
                details={
                    "candidate_object": norm_object,
                    "existing_object": best_object,
                    "threshold": LINK_THRESHOLD_DIFF_OBJECT,
                },
            ))

        # Same action + same object → tiered thresholds: link / review / new
        if best_score > LINK_THRESHOLD:
            return DedupResult(
                verdict="link", stage="embedding_match",
                matched_control_uuid=best_payload.get("control_uuid"),
                matched_control_id=best_payload.get("control_id"),
                matched_title=best_payload.get("title"),
                similarity_score=best_score,
            )
        if best_score > REVIEW_THRESHOLD:
            return DedupResult(
                verdict="review", stage="embedding_review",
                matched_control_uuid=best_payload.get("control_uuid"),
                matched_control_id=best_payload.get("control_id"),
                matched_title=best_payload.get("title"),
                similarity_score=best_score,
            )
        return await self._check_cross_regulation(embedding, DedupResult(
            verdict="new", stage="embedding_below_threshold",
            similarity_score=best_score,
            details={"threshold": REVIEW_THRESHOLD},
        ))

    async def _check_cross_regulation(
        self,
        embedding: list[float],
        intra_result: DedupResult,
    ) -> DedupResult:
        """Second pass: cross-regulation linking for controls deemed 'new'.

        Searches Qdrant WITHOUT pattern_id filter. Uses a higher threshold
        (CROSS_REG_LINK_THRESHOLD) to avoid false positives across
        regulation boundaries.
        """
        if intra_result.verdict != "new" or not embedding:
            return intra_result

        cross_results = await qdrant_search_cross_regulation(embedding, top_k=5)
        if not cross_results:
            return intra_result

        best = cross_results[0]
        best_score = best.get("score", 0.0)
        if best_score > CROSS_REG_LINK_THRESHOLD:
            best_payload = best.get("payload", {})
            return DedupResult(
                verdict="link",
                stage="cross_regulation",
                matched_control_uuid=best_payload.get("control_uuid"),
                matched_control_id=best_payload.get("control_id"),
                matched_title=best_payload.get("title"),
                similarity_score=best_score,
                link_type="cross_regulation",
                details={
                    "cross_reg_score": best_score,
                    "cross_reg_threshold": CROSS_REG_LINK_THRESHOLD,
                },
            )

        # Below cross-reg threshold → keep the intra-pattern "new" verdict.
        return intra_result

    def add_parent_link(
        self,
        control_uuid: str,
        parent_control_uuid: str,
        link_type: str = "dedup_merge",
        confidence: float = 0.0,
        source_regulation: Optional[str] = None,
        source_article: Optional[str] = None,
        obligation_candidate_id: Optional[str] = None,
    ) -> None:
        """Add a parent link to an existing atomic control.

        Idempotent: duplicate (control_uuid, parent_control_uuid) pairs are
        ignored via ON CONFLICT DO NOTHING. Commits immediately.
        """
        from sqlalchemy import text
        self.db.execute(text("""
            INSERT INTO control_parent_links
                (control_uuid, parent_control_uuid, link_type, confidence,
                 source_regulation, source_article, obligation_candidate_id)
            VALUES (:cu, :pu, :lt, :conf, :sr, :sa, :oci::uuid)
            ON CONFLICT (control_uuid, parent_control_uuid) DO NOTHING
        """), {
            "cu": control_uuid,
            "pu": parent_control_uuid,
            "lt": link_type,
            "conf": confidence,
            "sr": source_regulation,
            "sa": source_article,
            "oci": obligation_candidate_id,
        })
        self.db.commit()

    def write_review(
        self,
        candidate_control_id: str,
        candidate_title: str,
        candidate_objective: str,
        result: DedupResult,
        parent_control_uuid: Optional[str] = None,
        obligation_candidate_id: Optional[str] = None,
    ) -> None:
        """Write a dedup review queue entry. Commits immediately."""
        import json  # local: replaces the previous __import__("json") hack
        from sqlalchemy import text
        self.db.execute(text("""
            INSERT INTO control_dedup_reviews
                (candidate_control_id, candidate_title, candidate_objective,
                 matched_control_uuid, matched_control_id,
                 similarity_score, dedup_stage, dedup_details,
                 parent_control_uuid, obligation_candidate_id)
            VALUES (:ccid, :ct, :co, :mcu::uuid, :mci, :ss, :ds,
                    :dd::jsonb, :pcu::uuid, :oci)
        """), {
            "ccid": candidate_control_id,
            "ct": candidate_title,
            "co": candidate_objective,
            "mcu": result.matched_control_uuid,
            "mci": result.matched_control_id,
            "ss": result.similarity_score,
            "ds": result.stage,
            "dd": json.dumps(result.details),
            "pcu": parent_control_uuid,
            "oci": obligation_candidate_id,
        })
        self.db.commit()

    async def index_control(
        self,
        control_uuid: str,
        control_id: str,
        title: str,
        action: str,
        obj: str,
        pattern_id: str,
        collection: Optional[str] = None,
    ) -> bool:
        """Index a new atomic control in Qdrant for future dedup checks.

        Returns False when no embedding could be computed or the upsert
        failed; True on success.
        """
        norm_action = normalize_action(action)
        norm_object = normalize_object(obj)
        canonical = canonicalize_text(action, obj, title)
        embedding = await self._embed(canonical)
        if not embedding:
            return False
        return await qdrant_upsert(
            point_id=control_uuid,
            embedding=embedding,
            payload={
                "control_uuid": control_uuid,
                "control_id": control_id,
                "title": title,
                "pattern_id": pattern_id,
                "action_normalized": norm_action,
                "object_normalized": norm_object,
                "canonical_text": canonical,
            },
            collection=collection,
        )
|
||||
File diff suppressed because it is too large
Load Diff
152
backend-compliance/compliance/services/control_status_machine.py
Normal file
152
backend-compliance/compliance/services/control_status_machine.py
Normal file
@@ -0,0 +1,152 @@
|
||||
"""
|
||||
Control Status Transition State Machine.
|
||||
|
||||
Enforces that controls cannot be set to "pass" without sufficient evidence.
|
||||
Prevents Compliance-Theater where controls claim compliance without real proof.
|
||||
|
||||
Transition rules:
|
||||
planned → in_progress : always allowed
|
||||
    in_progress → pass      : requires ≥1 evidence with confidence ≥ E2 and
                              truth_status in (uploaded, observed, validated_internal,
                              accepted_by_auditor, provided_to_auditor)
|
||||
in_progress → partial : requires ≥1 evidence (any level)
|
||||
pass → fail : always allowed (degradation)
|
||||
any → n/a : requires status_justification
|
||||
any → planned : always allowed (reset)
|
||||
"""
|
||||
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
from ..db.models import EvidenceDB
|
||||
|
||||
|
||||
# Confidence level ordering for comparisons (E0 weakest … E4 strongest);
# used to enforce the ">= E2" bar for pass transitions.
CONFIDENCE_ORDER: dict[str, int] = {"E0": 0, "E1": 1, "E2": 2, "E3": 3, "E4": 4}

# Truth statuses that qualify as "real" evidence for pass transitions
VALID_TRUTH_STATUSES: set[str] = {"uploaded", "observed", "validated_internal", "accepted_by_auditor", "provided_to_auditor"}
|
||||
|
||||
|
||||
def validate_transition(
    current_status: str,
    new_status: str,
    evidence_list: Optional[List[EvidenceDB]] = None,
    status_justification: Optional[str] = None,
    bypass_for_auto_updater: bool = False,
) -> Tuple[bool, List[str]]:
    """
    Validate whether a control status transition is allowed.

    Args:
        current_status: Current control status value (e.g. "planned", "pass")
        new_status: Requested new status
        evidence_list: List of EvidenceDB objects linked to this control
        status_justification: Text justification (required for n/a transitions)
        bypass_for_auto_updater: If True, skip evidence checks (used by CI/CD auto-updater
                                 which creates evidence atomically with status change)

    Returns:
        Tuple of (allowed: bool, violations: list[str])
    """
    violations: List[str] = []
    evidence_list = evidence_list or []

    # Same status → no-op, always allowed
    if current_status == new_status:
        return True, []

    # Reset to planned is always allowed
    if new_status == "planned":
        return True, []

    # n/a requires justification
    if new_status == "n/a":
        if not status_justification or not status_justification.strip():
            violations.append("Transition to 'n/a' requires a status_justification explaining why this control is not applicable.")
        return len(violations) == 0, violations

    # Degradation: pass → fail is always allowed
    if current_status == "pass" and new_status == "fail":
        return True, []

    # planned → in_progress: always allowed
    if current_status == "planned" and new_status == "in_progress":
        return True, []

    # → partial: needs at least 1 evidence (any level)
    if new_status == "partial":
        if not bypass_for_auto_updater and len(evidence_list) == 0:
            violations.append("Transition to 'partial' requires at least 1 evidence record.")
        return len(violations) == 0, violations

    # → pass: strict requirements
    if new_status == "pass":
        if bypass_for_auto_updater:
            return True, []

        # BUGFIX: this guard must precede the evidence checks. It previously
        # sat AFTER this always-returning branch and was unreachable, which
        # allowed planned → pass directly whenever qualifying evidence
        # existed — contrary to the documented rule.
        if current_status in ("planned", "fail"):
            violations.append(
                f"Direct transition from '{current_status}' to 'pass' is not allowed. "
                f"Move to 'in_progress' first, then to 'pass' with qualifying evidence."
            )
            return False, violations

        if len(evidence_list) == 0:
            violations.append("Transition to 'pass' requires at least 1 evidence record.")
            return False, violations

        # in_progress → pass and partial → pass share the same evidence bar.
        if not _has_qualifying_evidence(evidence_list):
            violations.append(
                "Transition to 'pass' requires at least 1 evidence with confidence >= E2 "
                "and truth_status in (uploaded, observed, validated_internal, "
                "accepted_by_auditor, provided_to_auditor). "
                "Current evidence does not meet this threshold."
            )

        return len(violations) == 0, violations

    # → fail (e.g. in_progress → fail): always allowed
    if new_status == "fail":
        return True, []

    # All other transitions allowed
    return True, []


def _has_qualifying_evidence(evidence_list: List[EvidenceDB]) -> bool:
    """True if any evidence has confidence >= E2 and a qualifying truth_status.

    confidence_level / truth_status may be enum members or plain strings;
    missing values fall back to the weakest assumptions (E1 / uploaded).
    """
    for e in evidence_list:
        conf = getattr(e, "confidence_level", None)
        truth = getattr(e, "truth_status", None)

        # Accept either an Enum (use .value) or a raw string.
        conf_val = conf.value if hasattr(conf, "value") else str(conf) if conf else "E1"
        truth_val = truth.value if hasattr(truth, "value") else str(truth) if truth else "uploaded"

        if CONFIDENCE_ORDER.get(conf_val, 1) >= CONFIDENCE_ORDER["E2"] and truth_val in VALID_TRUTH_STATUSES:
            return True
    return False
|
||||
3877
backend-compliance/compliance/services/decomposition_pass.py
Normal file
3877
backend-compliance/compliance/services/decomposition_pass.py
Normal file
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,714 @@
|
||||
"""Framework Decomposition Engine — decomposes framework-container obligations.
|
||||
|
||||
Sits between Pass 0a (obligation extraction) and Pass 0b (atomic control
|
||||
composition). Detects obligations that reference a framework domain (e.g.
|
||||
"CCM-Praktiken fuer AIS") and decomposes them into concrete sub-obligations
|
||||
using an internal framework registry.
|
||||
|
||||
Three routing types:
|
||||
atomic → pass through to Pass 0b unchanged
|
||||
compound → split compound verbs, then Pass 0b
|
||||
framework_container → decompose via registry, then Pass 0b
|
||||
|
||||
The registry is a set of JSON files under compliance/data/frameworks/.
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import uuid
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Registry loading
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Directory containing one JSON file per framework: compliance/data/frameworks/
_REGISTRY_DIR = Path(__file__).resolve().parent.parent / "data" / "frameworks"
# Lazily populated cache (see get_registry / reload_registry).
_REGISTRY: dict[str, dict] = {}  # framework_id → framework dict
|
||||
|
||||
|
||||
def _load_registry() -> dict[str, dict]:
    """Read every *.json framework definition under the registry directory.

    Files that fail to parse are logged (with traceback) and skipped.
    Loaded frameworks are keyed by their "framework_id" field, falling
    back to the file stem when the field is absent.
    """
    loaded: dict[str, dict] = {}
    if not _REGISTRY_DIR.is_dir():
        logger.warning("Framework registry dir not found: %s", _REGISTRY_DIR)
        return loaded

    for json_path in sorted(_REGISTRY_DIR.glob("*.json")):
        try:
            with open(json_path, encoding="utf-8") as handle:
                framework = json.load(handle)
            key = framework.get("framework_id", json_path.stem)
            loaded[key] = framework
            logger.info(
                "Loaded framework: %s (%d domains)",
                key,
                len(framework.get("domains", [])),
            )
        except Exception:
            logger.exception("Failed to load framework file: %s", json_path)
    return loaded
|
||||
|
||||
|
||||
def get_registry() -> dict[str, dict]:
    """Return the global framework registry, loading it on first access."""
    global _REGISTRY
    if _REGISTRY:
        return _REGISTRY
    _REGISTRY = _load_registry()
    return _REGISTRY
|
||||
|
||||
|
||||
def reload_registry() -> dict[str, dict]:
    """Discard the cached registry and re-read all framework files from disk."""
    global _REGISTRY
    fresh = _load_registry()
    _REGISTRY = fresh
    return fresh
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Framework alias index (built from registry)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _build_alias_index(registry: dict[str, dict]) -> dict[str, str]:
|
||||
"""Build a lowercase alias → framework_id lookup."""
|
||||
idx: dict[str, str] = {}
|
||||
for fw_id, fw in registry.items():
|
||||
# Framework-level aliases
|
||||
idx[fw_id.lower()] = fw_id
|
||||
name = fw.get("display_name", "")
|
||||
if name:
|
||||
idx[name.lower()] = fw_id
|
||||
# Common short forms
|
||||
for part in fw_id.lower().replace("_", " ").split():
|
||||
if len(part) >= 3:
|
||||
idx[part] = fw_id
|
||||
return idx
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Routing — classify obligation type
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Extended patterns for framework detection (beyond the simple _COMPOSITE_RE
# in decomposition_pass.py — here we also capture the framework name)
#
# Matches phrases like "Praktiken für <X>" / "controls from <X>" and captures
# <X> in group 1. German keywords are spelled with both umlaut and ue/ae/ss
# transliterations (e.g. "f(?:ue|ü)r" matches "fuer" and "für").
_FRAMEWORK_PATTERN = re.compile(
    r"(?:praktiken|kontrollen|ma(?:ss|ß)nahmen|anforderungen|vorgaben|controls|practices|measures|requirements)"
    r"\s+(?:f(?:ue|ü)r|aus|gem(?:ae|ä)(?:ss|ß)|nach|from|of|for|per)\s+"
    r"(.+?)(?:\s+(?:m(?:ue|ü)ssen|sollen|sind|werden|implementieren|umsetzen|einf(?:ue|ü)hren)|\.|,|$)",
    re.IGNORECASE,
)

# Direct framework name references — well-known compliance framework names
# matched verbatim (group 0 is the whole name).
_DIRECT_FRAMEWORK_RE = re.compile(
    r"\b(?:CSA\s*CCM|NIST\s*(?:SP\s*)?800-53|OWASP\s*(?:ASVS|SAMM|Top\s*10)"
    r"|CIS\s*Controls|BSI\s*(?:IT-)?Grundschutz|ENISA|ISO\s*2700[12]"
    r"|COBIT|SOX|PCI\s*DSS|HITRUST|SOC\s*2|KRITIS)\b",
    re.IGNORECASE,
)

# Compound verb patterns (multiple main verbs) — conjunctions that suggest an
# obligation contains more than one action.
_COMPOUND_VERB_RE = re.compile(
    r"\b(?:und|sowie|als\s+auch|or|and)\b",
    re.IGNORECASE,
)

# No-split phrases that look compound but aren't: fixed verb pairs that
# describe one activity and must NOT trigger compound splitting.
_NO_SPLIT_PHRASES = [
    "pflegen und aufrechterhalten",
    "dokumentieren und pflegen",
    "definieren und dokumentieren",
    "erstellen und freigeben",
    "pruefen und genehmigen",
    "identifizieren und bewerten",
    "erkennen und melden",
    "define and maintain",
    "create and maintain",
    "establish and maintain",
    "monitor and review",
    "detect and respond",
]
|
||||
|
||||
|
||||
@dataclass
class RoutingResult:
    """Result of obligation routing classification."""
    routing_type: str  # atomic | compound | framework_container | unknown_review
    framework_ref: Optional[str] = None      # resolved registry framework_id, if any
    framework_domain: Optional[str] = None   # domain id within the framework (e.g. "AIS")
    domain_title: Optional[str] = None       # human-readable title of the matched domain
    confidence: float = 0.0                  # heuristic confidence of the classification
    reason: str = ""                         # machine-readable reason tag for the verdict
|
||||
|
||||
|
||||
def classify_routing(
    obligation_text: str,
    action_raw: str,
    object_raw: str,
    condition_raw: Optional[str] = None,
) -> RoutingResult:
    """Classify an obligation into atomic / compound / framework_container.

    Args:
        obligation_text: Full obligation sentence.
        action_raw: Extracted main action/verb phrase.
        object_raw: Extracted object phrase.
        condition_raw: Extracted condition. Currently unused; kept for
            interface stability and future condition-aware routing.

    Returns:
        RoutingResult with routing_type plus confidence and reason.
    """
    # (Removed a dead `combined` local that was computed but never used.)

    # --- Step 1: Framework container detection ---
    fw_result = _detect_framework(obligation_text, object_raw)
    if fw_result.routing_type == "framework_container":
        return fw_result

    # --- Step 2: Compound verb detection ---
    if _is_compound_obligation(action_raw, obligation_text):
        return RoutingResult(
            routing_type="compound",
            confidence=0.7,
            reason="multiple_main_verbs",
        )

    # --- Step 3: Default = atomic ---
    return RoutingResult(
        routing_type="atomic",
        confidence=0.9,
        reason="single_action_single_object",
    )
|
||||
|
||||
|
||||
def _detect_framework(
    obligation_text: str, object_raw: str,
) -> RoutingResult:
    """Detect if obligation references a framework domain.

    Tries three strategies in descending confidence order:
      1. direct framework name (_DIRECT_FRAMEWORK_RE) resolved via registry
      2. "Praktiken für X"-style pattern (_FRAMEWORK_PATTERN) resolved from
         the captured reference text
      3. framework-ish keywords in the object (lowest confidence, no ref)
    Falls through to an atomic result with confidence 0.0.
    """
    # Search obligation text and object together.
    combined = f"{obligation_text} {object_raw}"
    registry = get_registry()
    alias_idx = _build_alias_index(registry)

    # Strategy 1: direct framework name match
    m = _DIRECT_FRAMEWORK_RE.search(combined)
    if m:
        fw_name = m.group(0).strip()
        fw_id = _resolve_framework_id(fw_name, alias_idx, registry)
        if fw_id:
            # Try to pin down a specific domain within the framework;
            # confidence is higher when a domain is identified.
            domain_id, domain_title = _match_domain(
                combined, registry[fw_id],
            )
            return RoutingResult(
                routing_type="framework_container",
                framework_ref=fw_id,
                framework_domain=domain_id,
                domain_title=domain_title,
                confidence=0.95 if domain_id else 0.75,
                reason=f"direct_framework_match:{fw_name}",
            )
        else:
            # Framework name recognized but not in registry
            return RoutingResult(
                routing_type="framework_container",
                framework_ref=None,
                framework_domain=None,
                confidence=0.6,
                reason=f"direct_framework_match_no_registry:{fw_name}",
            )

    # Strategy 2: pattern match ("Praktiken fuer X")
    m2 = _FRAMEWORK_PATTERN.search(combined)
    if m2:
        ref_text = m2.group(1).strip()
        # _resolve_from_ref_text is defined elsewhere in this module;
        # presumably it resolves fw/domain from the free-text reference.
        fw_id, domain_id, domain_title = _resolve_from_ref_text(
            ref_text, registry, alias_idx,
        )
        if fw_id:
            return RoutingResult(
                routing_type="framework_container",
                framework_ref=fw_id,
                framework_domain=domain_id,
                domain_title=domain_title,
                confidence=0.85 if domain_id else 0.65,
                reason=f"pattern_match:{ref_text}",
            )

    # Strategy 3: keyword-heavy object — weakest signal, no framework resolved.
    if _has_framework_keywords(object_raw):
        return RoutingResult(
            routing_type="framework_container",
            framework_ref=None,
            framework_domain=None,
            confidence=0.5,
            reason="framework_keywords_in_object",
        )

    # No framework signal → treat as atomic (caller continues its pipeline).
    return RoutingResult(routing_type="atomic", confidence=0.0)
|
||||
|
||||
|
||||
def _resolve_framework_id(
|
||||
name: str,
|
||||
alias_idx: dict[str, str],
|
||||
registry: dict[str, dict],
|
||||
) -> Optional[str]:
|
||||
"""Resolve a framework name to its registry ID."""
|
||||
normalized = re.sub(r"\s+", " ", name.strip().lower())
|
||||
# Direct alias match
|
||||
if normalized in alias_idx:
|
||||
return alias_idx[normalized]
|
||||
# Try compact form (strip spaces, hyphens, underscores)
|
||||
compact = re.sub(r"[\s_\-]+", "", normalized)
|
||||
for alias, fw_id in alias_idx.items():
|
||||
if re.sub(r"[\s_\-]+", "", alias) == compact:
|
||||
return fw_id
|
||||
# Substring match in display names
|
||||
for fw_id, fw in registry.items():
|
||||
display = fw.get("display_name", "").lower()
|
||||
if normalized in display or display in normalized:
|
||||
return fw_id
|
||||
# Partial match: check if normalized contains any alias (for multi-word refs)
|
||||
for alias, fw_id in alias_idx.items():
|
||||
if len(alias) >= 4 and alias in normalized:
|
||||
return fw_id
|
||||
return None
|
||||
|
||||
|
||||
def _match_domain(
|
||||
text: str, framework: dict,
|
||||
) -> tuple[Optional[str], Optional[str]]:
|
||||
"""Match a domain within a framework from text references."""
|
||||
text_lower = text.lower()
|
||||
best_id: Optional[str] = None
|
||||
best_title: Optional[str] = None
|
||||
best_score = 0
|
||||
|
||||
for domain in framework.get("domains", []):
|
||||
score = 0
|
||||
domain_id = domain["domain_id"]
|
||||
title = domain.get("title", "")
|
||||
|
||||
# Exact domain ID match (e.g. "AIS")
|
||||
if re.search(rf"\b{re.escape(domain_id)}\b", text, re.IGNORECASE):
|
||||
score += 10
|
||||
|
||||
# Full title match
|
||||
if title.lower() in text_lower:
|
||||
score += 8
|
||||
|
||||
# Alias match
|
||||
for alias in domain.get("aliases", []):
|
||||
if alias.lower() in text_lower:
|
||||
score += 6
|
||||
break
|
||||
|
||||
# Keyword overlap
|
||||
kw_hits = sum(
|
||||
1 for kw in domain.get("keywords", [])
|
||||
if kw.lower() in text_lower
|
||||
)
|
||||
score += kw_hits
|
||||
|
||||
if score > best_score:
|
||||
best_score = score
|
||||
best_id = domain_id
|
||||
best_title = title
|
||||
|
||||
if best_score >= 3:
|
||||
return best_id, best_title
|
||||
return None, None
|
||||
|
||||
|
||||
def _resolve_from_ref_text(
|
||||
ref_text: str,
|
||||
registry: dict[str, dict],
|
||||
alias_idx: dict[str, str],
|
||||
) -> tuple[Optional[str], Optional[str], Optional[str]]:
|
||||
"""Resolve framework + domain from a reference text like 'AIS' or 'Application Security'."""
|
||||
ref_lower = ref_text.lower()
|
||||
|
||||
for fw_id, fw in registry.items():
|
||||
for domain in fw.get("domains", []):
|
||||
# Check domain ID
|
||||
if domain["domain_id"].lower() in ref_lower:
|
||||
return fw_id, domain["domain_id"], domain.get("title")
|
||||
# Check title
|
||||
if domain.get("title", "").lower() in ref_lower:
|
||||
return fw_id, domain["domain_id"], domain.get("title")
|
||||
# Check aliases
|
||||
for alias in domain.get("aliases", []):
|
||||
if alias.lower() in ref_lower or ref_lower in alias.lower():
|
||||
return fw_id, domain["domain_id"], domain.get("title")
|
||||
|
||||
return None, None, None
|
||||
|
||||
|
||||
# Keywords (German + English) indicating that an obligation refers to a
# whole control framework/catalogue rather than one atomic requirement.
# Consumed by _has_framework_keywords(), which requires >= 2 distinct hits.
_FRAMEWORK_KW_SET = {
    "praktiken", "kontrollen", "massnahmen", "maßnahmen",
    "anforderungen", "vorgaben", "framework", "standard",
    "baseline", "katalog", "domain", "family", "category",
    "practices", "controls", "measures", "requirements",
}
|
||||
|
||||
|
||||
def _has_framework_keywords(text: str) -> bool:
    """Check if text contains framework-indicator keywords.

    Tokenizes on lowercase letter runs (incl. German umlauts/ß) and
    requires at least two distinct hits in _FRAMEWORK_KW_SET.
    """
    tokens = re.findall(r"[a-zäöüß]+", text.lower())
    return len(_FRAMEWORK_KW_SET.intersection(tokens)) >= 2
|
||||
|
||||
|
||||
def _is_compound_obligation(action_raw: str, obligation_text: str) -> bool:
    """Detect if the obligation has multiple competing main verbs.

    Returns True only when the raw action contains a conjunction and
    splits into at least two substantial fragments; protected phrases
    from _NO_SPLIT_PHRASES are never treated as compound.
    """
    if not action_raw:
        return False

    action = action_raw.lower().strip()

    # Phrases that look compound but must never be split apart.
    if any(phrase in action for phrase in _NO_SPLIT_PHRASES):
        return False

    # Without a conjunction there is nothing to split.
    if not _COMPOUND_VERB_RE.search(action):
        return False

    # Split on conjunctions; compound only if 2+ substantial fragments remain.
    fragments = re.split(r"\b(?:und|sowie|als\s+auch|or|and)\b", action)
    substantial = [frag.strip() for frag in fragments if len(frag.strip()) >= 3]
    return len(substantial) >= 2
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Framework Decomposition
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@dataclass
class DecomposedObligation:
    """A concrete obligation derived from a framework container.

    Produced by decompose_framework_container(): one instance per matched
    subcontrol, carrying enough provenance to trace back to the parent
    control and the container it was decomposed from.
    """
    obligation_candidate_id: str            # "<parent candidate id>-<subcontrol_id>"
    parent_control_id: str                  # control the container belongs to
    parent_framework_container_id: str      # "FWC-…" id of the container
    source_ref_law: str                     # framework display name
    source_ref_article: str                 # subcontrol id within the framework
    obligation_text: str                    # subcontrol statement text
    actor: str
    action_raw: str                         # action hint, or inferred via _infer_action()
    object_raw: str                         # object hint, or inferred via _infer_object()
    condition_raw: Optional[str] = None
    trigger_raw: Optional[str] = None
    routing_type: str = "atomic"            # decomposed obligations are always atomic
    release_state: str = "decomposed"
    subcontrol_id: str = ""
    # Metadata copied from the subcontrol registry entry
    action_hint: str = ""
    object_hint: str = ""
    object_class: str = ""
    keywords: list[str] = field(default_factory=list)
|
||||
|
||||
|
||||
@dataclass
class FrameworkDecompositionResult:
    """Result of framework decomposition.

    Aggregates the decomposed obligations together with matching
    provenance, a confidence score, and any quality issues collected
    along the way (strings prefixed "ERROR:" or "WARN:").
    """
    framework_container_id: str                 # generated "FWC-…" id
    source_obligation_candidate_id: str
    framework_ref: Optional[str]                # resolved registry framework id, if any
    framework_domain: Optional[str]             # resolved domain id, if any
    domain_title: Optional[str]
    matched_subcontrols: list[str]              # subcontrol ids that were decomposed
    decomposition_confidence: float             # 0.0–1.0, see _compute_decomposition_confidence()
    release_state: str  # decomposed | unmatched | error
    decomposed_obligations: list[DecomposedObligation]
    issues: list[str]                           # "ERROR:…" / "WARN:…" diagnostics
|
||||
|
||||
|
||||
def decompose_framework_container(
    obligation_candidate_id: str,
    parent_control_id: str,
    obligation_text: str,
    framework_ref: Optional[str],
    framework_domain: Optional[str],
    actor: str = "organization",
) -> FrameworkDecompositionResult:
    """Decompose a framework-container obligation into concrete sub-obligations.

    Steps:
    1. Resolve framework from registry
    2. Resolve domain within framework
    3. Select relevant subcontrols (keyword filter or full domain)
    4. Generate decomposed obligations

    Returns an "unmatched" result (no decomposed obligations, confidence
    0.0) when either the framework or any subcontrol cannot be resolved;
    otherwise a "decomposed" result with one obligation per subcontrol.
    """
    container_id = f"FWC-{uuid.uuid4().hex[:8]}"
    registry = get_registry()
    issues: list[str] = []

    # Step 1: Resolve framework — explicit ref first, then name search in text
    fw = None
    if framework_ref and framework_ref in registry:
        fw = registry[framework_ref]
    else:
        # Try to find by name in text
        fw, framework_ref = _find_framework_in_text(obligation_text, registry)

    if not fw:
        issues.append("ERROR: framework_not_matched")
        return FrameworkDecompositionResult(
            framework_container_id=container_id,
            source_obligation_candidate_id=obligation_candidate_id,
            framework_ref=framework_ref,
            framework_domain=framework_domain,
            domain_title=None,
            matched_subcontrols=[],
            decomposition_confidence=0.0,
            release_state="unmatched",
            decomposed_obligations=[],
            issues=issues,
        )

    # Step 2: Resolve domain — explicit domain id first (case-insensitive),
    # then text-based matching via _match_domain()
    domain_data = None
    domain_title = None
    if framework_domain:
        for d in fw.get("domains", []):
            if d["domain_id"].lower() == framework_domain.lower():
                domain_data = d
                domain_title = d.get("title")
                break
    if not domain_data:
        # Try matching from text
        domain_id, domain_title = _match_domain(obligation_text, fw)
        if domain_id:
            for d in fw.get("domains", []):
                if d["domain_id"] == domain_id:
                    domain_data = d
                    framework_domain = domain_id
                    break

    if not domain_data:
        issues.append("WARN: domain_not_matched — using all domains")
        # Fall back to all subcontrols across all domains
        all_subcontrols = []
        for d in fw.get("domains", []):
            for sc in d.get("subcontrols", []):
                # NOTE(review): tags the shared registry's subcontrol dicts
                # in place — consider copying before mutation.
                sc["_domain_id"] = d["domain_id"]
                all_subcontrols.append(sc)
        subcontrols = _select_subcontrols(obligation_text, all_subcontrols)
        if not subcontrols:
            issues.append("ERROR: no_subcontrols_matched")
            return FrameworkDecompositionResult(
                framework_container_id=container_id,
                source_obligation_candidate_id=obligation_candidate_id,
                framework_ref=framework_ref,
                framework_domain=framework_domain,
                domain_title=None,
                matched_subcontrols=[],
                decomposition_confidence=0.0,
                release_state="unmatched",
                decomposed_obligations=[],
                issues=issues,
            )
    else:
        # Step 3: Select subcontrols from domain
        raw_subcontrols = domain_data.get("subcontrols", [])
        subcontrols = _select_subcontrols(obligation_text, raw_subcontrols)
        if not subcontrols:
            # No targeted keyword match: full domain decomposition
            subcontrols = raw_subcontrols

    # Quality check: too many subcontrols
    if len(subcontrols) > 25:
        issues.append(f"WARN: {len(subcontrols)} subcontrols — may be too broad")

    # Step 4: Generate one decomposed obligation per selected subcontrol
    display_name = fw.get("display_name", framework_ref or "Unknown")
    decomposed: list[DecomposedObligation] = []
    matched_ids: list[str] = []

    for sc in subcontrols:
        sc_id = sc.get("subcontrol_id", "")
        matched_ids.append(sc_id)

        action_hint = sc.get("action_hint", "")
        object_hint = sc.get("object_hint", "")

        # Quality warnings for incomplete registry entries
        if not action_hint:
            issues.append(f"WARN: {sc_id} missing action_hint")
        if not object_hint:
            issues.append(f"WARN: {sc_id} missing object_hint")

        obl_id = f"{obligation_candidate_id}-{sc_id}"

        decomposed.append(DecomposedObligation(
            obligation_candidate_id=obl_id,
            parent_control_id=parent_control_id,
            parent_framework_container_id=container_id,
            source_ref_law=display_name,
            source_ref_article=sc_id,
            obligation_text=sc.get("statement", ""),
            actor=actor,
            # Fall back to heuristic inference when hints are missing
            action_raw=action_hint or _infer_action(sc.get("statement", "")),
            object_raw=object_hint or _infer_object(sc.get("statement", "")),
            routing_type="atomic",
            release_state="decomposed",
            subcontrol_id=sc_id,
            action_hint=action_hint,
            object_hint=object_hint,
            object_class=sc.get("object_class", ""),
            keywords=sc.get("keywords", []),
        ))

    # Check if decomposed obligations are identical to the container text
    # (would mean the decomposition added no information)
    for d in decomposed:
        if d.obligation_text.strip() == obligation_text.strip():
            issues.append(f"WARN: {d.subcontrol_id} identical to container text")

    confidence = _compute_decomposition_confidence(
        framework_ref, framework_domain, domain_data, len(subcontrols), issues,
    )

    return FrameworkDecompositionResult(
        framework_container_id=container_id,
        source_obligation_candidate_id=obligation_candidate_id,
        framework_ref=framework_ref,
        framework_domain=framework_domain,
        domain_title=domain_title,
        matched_subcontrols=matched_ids,
        decomposition_confidence=confidence,
        release_state="decomposed",
        decomposed_obligations=decomposed,
        issues=issues,
    )
|
||||
|
||||
|
||||
def _find_framework_in_text(
    text: str, registry: dict[str, dict],
) -> tuple[Optional[dict], Optional[str]]:
    """Try to find a framework by searching text for known names.

    Returns (framework_dict, framework_id) on success, (None, None)
    when no direct framework name is present or resolvable.
    """
    match = _DIRECT_FRAMEWORK_RE.search(text)
    if not match:
        return None, None

    aliases = _build_alias_index(registry)
    fw_id = _resolve_framework_id(match.group(0), aliases, registry)
    if not fw_id or fw_id not in registry:
        return None, None
    return registry[fw_id], fw_id
|
||||
|
||||
|
||||
def _select_subcontrols(
|
||||
obligation_text: str, subcontrols: list[dict],
|
||||
) -> list[dict]:
|
||||
"""Select relevant subcontrols based on keyword matching.
|
||||
|
||||
Returns empty list if no targeted match found (caller falls back to
|
||||
full domain).
|
||||
"""
|
||||
text_lower = obligation_text.lower()
|
||||
scored: list[tuple[int, dict]] = []
|
||||
|
||||
for sc in subcontrols:
|
||||
score = 0
|
||||
for kw in sc.get("keywords", []):
|
||||
if kw.lower() in text_lower:
|
||||
score += 1
|
||||
# Title match
|
||||
title = sc.get("title", "").lower()
|
||||
if title and title in text_lower:
|
||||
score += 3
|
||||
# Object hint in text
|
||||
obj = sc.get("object_hint", "").lower()
|
||||
if obj and obj in text_lower:
|
||||
score += 2
|
||||
|
||||
if score > 0:
|
||||
scored.append((score, sc))
|
||||
|
||||
if not scored:
|
||||
return []
|
||||
|
||||
# Only return those with meaningful overlap (score >= 2)
|
||||
scored.sort(key=lambda x: x[0], reverse=True)
|
||||
return [sc for score, sc in scored if score >= 2]
|
||||
|
||||
|
||||
def _infer_action(statement: str) -> str:
|
||||
"""Infer a basic action verb from a statement."""
|
||||
s = statement.lower()
|
||||
if any(w in s for w in ["definiert", "definieren", "define"]):
|
||||
return "definieren"
|
||||
if any(w in s for w in ["implementiert", "implementieren", "implement"]):
|
||||
return "implementieren"
|
||||
if any(w in s for w in ["dokumentiert", "dokumentieren", "document"]):
|
||||
return "dokumentieren"
|
||||
if any(w in s for w in ["ueberwacht", "ueberwachen", "monitor"]):
|
||||
return "ueberwachen"
|
||||
if any(w in s for w in ["getestet", "testen", "test"]):
|
||||
return "testen"
|
||||
if any(w in s for w in ["geschuetzt", "schuetzen", "protect"]):
|
||||
return "implementieren"
|
||||
if any(w in s for w in ["verwaltet", "verwalten", "manage"]):
|
||||
return "pflegen"
|
||||
if any(w in s for w in ["gemeldet", "melden", "report"]):
|
||||
return "melden"
|
||||
return "implementieren"
|
||||
|
||||
|
||||
def _infer_object(statement: str) -> str:
|
||||
"""Infer the primary object from a statement (first noun phrase)."""
|
||||
# Simple heuristic: take the text after "muessen"/"muss" up to the verb
|
||||
m = re.search(
|
||||
r"(?:muessen|muss|m(?:ü|ue)ssen)\s+(.+?)(?:\s+werden|\s+sein|\.|,|$)",
|
||||
statement,
|
||||
re.IGNORECASE,
|
||||
)
|
||||
if m:
|
||||
return m.group(1).strip()[:80]
|
||||
# Fallback: first 80 chars
|
||||
return statement[:80] if statement else ""
|
||||
|
||||
|
||||
def _compute_decomposition_confidence(
|
||||
framework_ref: Optional[str],
|
||||
domain: Optional[str],
|
||||
domain_data: Optional[dict],
|
||||
num_subcontrols: int,
|
||||
issues: list[str],
|
||||
) -> float:
|
||||
"""Compute confidence score for the decomposition."""
|
||||
score = 0.3
|
||||
if framework_ref:
|
||||
score += 0.25
|
||||
if domain:
|
||||
score += 0.20
|
||||
if domain_data:
|
||||
score += 0.10
|
||||
if 1 <= num_subcontrols <= 15:
|
||||
score += 0.10
|
||||
elif num_subcontrols > 15:
|
||||
score += 0.05 # less confident with too many
|
||||
|
||||
# Penalize errors
|
||||
errors = sum(1 for i in issues if i.startswith("ERROR:"))
|
||||
score -= errors * 0.15
|
||||
return round(max(min(score, 1.0), 0.0), 2)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Registry statistics (for admin/debugging)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def registry_stats() -> dict:
    """Return summary statistics about the loaded registry.

    Result keys: "frameworks", "details" (per-framework breakdown),
    "total_domains", "total_subcontrols".
    """
    registry = get_registry()
    details: list[dict] = []
    domain_total = 0
    subcontrol_total = 0

    for fw_id, fw in registry.items():
        fw_domains = fw.get("domains", [])
        fw_subcontrols = sum(len(d.get("subcontrols", [])) for d in fw_domains)
        domain_total += len(fw_domains)
        subcontrol_total += fw_subcontrols
        details.append({
            "framework_id": fw_id,
            "display_name": fw.get("display_name", ""),
            "domains": len(fw_domains),
            "subcontrols": fw_subcontrols,
        })

    return {
        "frameworks": len(registry),
        "details": details,
        "total_domains": domain_total,
        "total_subcontrols": subcontrol_total,
    }
|
||||
@@ -173,6 +173,7 @@ class LLMProviderType(str, Enum):
|
||||
"""Supported LLM provider types."""
|
||||
ANTHROPIC = "anthropic"
|
||||
SELF_HOSTED = "self_hosted"
|
||||
OLLAMA = "ollama" # Alias for self_hosted (Ollama-specific)
|
||||
MOCK = "mock" # For testing
|
||||
|
||||
|
||||
@@ -392,6 +393,7 @@ class SelfHostedProvider(LLMProvider):
|
||||
"model": self.model,
|
||||
"prompt": full_prompt,
|
||||
"stream": False,
|
||||
"think": False, # Disable thinking mode (qwen3.5 etc.)
|
||||
"options": {}
|
||||
}
|
||||
|
||||
@@ -549,7 +551,7 @@ def get_llm_config() -> LLMConfig:
|
||||
vault_path="breakpilot/api_keys/anthropic",
|
||||
env_var="ANTHROPIC_API_KEY"
|
||||
)
|
||||
elif provider_type == LLMProviderType.SELF_HOSTED:
|
||||
elif provider_type in (LLMProviderType.SELF_HOSTED, LLMProviderType.OLLAMA):
|
||||
api_key = get_secret_from_vault_or_env(
|
||||
vault_path="breakpilot/api_keys/self_hosted_llm",
|
||||
env_var="SELF_HOSTED_LLM_KEY"
|
||||
@@ -558,7 +560,7 @@ def get_llm_config() -> LLMConfig:
|
||||
# Select model based on provider type
|
||||
if provider_type == LLMProviderType.ANTHROPIC:
|
||||
model = os.getenv("ANTHROPIC_MODEL", "claude-sonnet-4-20250514")
|
||||
elif provider_type == LLMProviderType.SELF_HOSTED:
|
||||
elif provider_type in (LLMProviderType.SELF_HOSTED, LLMProviderType.OLLAMA):
|
||||
model = os.getenv("SELF_HOSTED_LLM_MODEL", "qwen2.5:14b")
|
||||
else:
|
||||
model = "mock-model"
|
||||
@@ -591,7 +593,7 @@ def get_llm_provider(config: Optional[LLMConfig] = None) -> LLMProvider:
|
||||
return MockProvider(config)
|
||||
return AnthropicProvider(config)
|
||||
|
||||
elif config.provider_type == LLMProviderType.SELF_HOSTED:
|
||||
elif config.provider_type in (LLMProviderType.SELF_HOSTED, LLMProviderType.OLLAMA):
|
||||
if not config.base_url:
|
||||
logger.warning("No self-hosted LLM URL found, using mock provider")
|
||||
return MockProvider(config)
|
||||
|
||||
59
backend-compliance/compliance/services/normative_patterns.py
Normal file
59
backend-compliance/compliance/services/normative_patterns.py
Normal file
@@ -0,0 +1,59 @@
|
||||
"""Shared normative language patterns for assertion classification.
|
||||
|
||||
Extracted from decomposition_pass.py for reuse in the assertion engine.
|
||||
"""
|
||||
|
||||
import re
|
||||
|
||||
_PFLICHT_SIGNALS = [
|
||||
r"\bmüssen\b", r"\bmuss\b", r"\bhat\s+sicherzustellen\b",
|
||||
r"\bhaben\s+sicherzustellen\b", r"\bsind\s+verpflichtet\b",
|
||||
r"\bist\s+verpflichtet\b",
|
||||
r"\bist\s+zu\s+\w+en\b", r"\bsind\s+zu\s+\w+en\b",
|
||||
r"\bhat\s+zu\s+\w+en\b", r"\bhaben\s+zu\s+\w+en\b",
|
||||
r"\bist\s+\w+zu\w+en\b", r"\bsind\s+\w+zu\w+en\b",
|
||||
r"\bist\s+\w+\s+zu\s+\w+en\b", r"\bsind\s+\w+\s+zu\s+\w+en\b",
|
||||
r"\bhat\s+\w+\s+zu\s+\w+en\b", r"\bhaben\s+\w+\s+zu\s+\w+en\b",
|
||||
r"\bshall\b", r"\bmust\b", r"\brequired\b",
|
||||
r"\b\w+zuteilen\b", r"\b\w+zuwenden\b", r"\b\w+zustellen\b", r"\b\w+zulegen\b",
|
||||
r"\b\w+zunehmen\b", r"\b\w+zuführen\b", r"\b\w+zuhalten\b", r"\b\w+zusetzen\b",
|
||||
r"\b\w+zuweisen\b", r"\b\w+zuordnen\b", r"\b\w+zufügen\b", r"\b\w+zugeben\b",
|
||||
r"\bist\b.{1,80}\bzu\s+\w+en\b", r"\bsind\b.{1,80}\bzu\s+\w+en\b",
|
||||
]
|
||||
PFLICHT_RE = re.compile("|".join(_PFLICHT_SIGNALS), re.IGNORECASE)
|
||||
|
||||
_EMPFEHLUNG_SIGNALS = [
|
||||
r"\bsoll\b", r"\bsollen\b", r"\bsollte\b", r"\bsollten\b",
|
||||
r"\bgewährleisten\b", r"\bsicherstellen\b",
|
||||
r"\bshould\b", r"\bensure\b", r"\brecommend\w*\b",
|
||||
r"\bnachweisen\b", r"\beinhalten\b", r"\bunterlassen\b", r"\bwahren\b",
|
||||
r"\bdokumentieren\b", r"\bimplementieren\b", r"\büberprüfen\b", r"\büberwachen\b",
|
||||
r"\bprüfen,\s+ob\b", r"\bkontrollieren,\s+ob\b",
|
||||
]
|
||||
EMPFEHLUNG_RE = re.compile("|".join(_EMPFEHLUNG_SIGNALS), re.IGNORECASE)
|
||||
|
||||
_KANN_SIGNALS = [
|
||||
r"\bkann\b", r"\bkönnen\b", r"\bdarf\b", r"\bdürfen\b",
|
||||
r"\bmay\b", r"\boptional\b",
|
||||
]
|
||||
KANN_RE = re.compile("|".join(_KANN_SIGNALS), re.IGNORECASE)
|
||||
|
||||
NORMATIVE_RE = re.compile(
|
||||
"|".join(_PFLICHT_SIGNALS + _EMPFEHLUNG_SIGNALS + _KANN_SIGNALS),
|
||||
re.IGNORECASE,
|
||||
)
|
||||
|
||||
_RATIONALE_SIGNALS = [
|
||||
r"\bda\s+", r"\bweil\b", r"\bgrund\b", r"\berwägung",
|
||||
r"\bbecause\b", r"\breason\b", r"\brationale\b",
|
||||
r"\bkönnen\s+.*\s+verursachen\b", r"\bführt\s+zu\b",
|
||||
]
|
||||
RATIONALE_RE = re.compile("|".join(_RATIONALE_SIGNALS), re.IGNORECASE)
|
||||
|
||||
# Evidence-related keywords (for fact detection)
|
||||
_EVIDENCE_KEYWORDS = [
|
||||
r"\bnachweis\b", r"\bzertifikat\b", r"\baudit.report\b",
|
||||
r"\bprotokoll\b", r"\bdokumentation\b", r"\bbericht\b",
|
||||
r"\bcertificate\b", r"\bevidence\b", r"\bproof\b",
|
||||
]
|
||||
EVIDENCE_RE = re.compile("|".join(_EVIDENCE_KEYWORDS), re.IGNORECASE)
|
||||
563
backend-compliance/compliance/services/obligation_extractor.py
Normal file
563
backend-compliance/compliance/services/obligation_extractor.py
Normal file
@@ -0,0 +1,563 @@
|
||||
"""Obligation Extractor — 3-Tier Chunk-to-Obligation Linking.
|
||||
|
||||
Maps RAG chunks to obligations from the v2 obligation framework using
|
||||
three tiers (fastest first):
|
||||
|
||||
Tier 1: EXACT MATCH — regulation_code + article → obligation_id (~40%)
|
||||
Tier 2: EMBEDDING — chunk text vs. obligation descriptions (~30%)
|
||||
Tier 3: LLM EXTRACT — local Ollama extracts obligation text (~25%)
|
||||
|
||||
Part of the Multi-Layer Control Architecture (Phase 4 of 8).
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import httpx
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Service endpoints and model settings, overridable via environment.
EMBEDDING_URL = os.getenv("EMBEDDING_URL", "http://embedding-service:8087")
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
OLLAMA_MODEL = os.getenv("CONTROL_GEN_OLLAMA_MODEL", "qwen3.5:35b-a3b")
LLM_TIMEOUT = float(os.getenv("CONTROL_GEN_LLM_TIMEOUT", "180"))  # seconds

# Embedding similarity thresholds for Tier 2
EMBEDDING_MATCH_THRESHOLD = 0.80       # accept as embedding_match at/above this
EMBEDDING_CANDIDATE_THRESHOLD = 0.60   # candidate floor (reserved for future use)

# ---------------------------------------------------------------------------
# Regulation code mapping: RAG chunk codes → obligation file regulation IDs
# ---------------------------------------------------------------------------

# Keys are lowercase codes as they appear on RAG chunks (EU CELEX-style
# codes and short names); values are the regulation ids used by the v2
# obligation files/manifest.
_REGULATION_CODE_TO_ID = {
    # DSGVO
    "eu_2016_679": "dsgvo",
    "dsgvo": "dsgvo",
    "gdpr": "dsgvo",
    # AI Act
    "eu_2024_1689": "ai_act",
    "ai_act": "ai_act",
    "aiact": "ai_act",
    # NIS2
    "eu_2022_2555": "nis2",
    "nis2": "nis2",
    "bsig": "nis2",
    # BDSG
    "bdsg": "bdsg",
    # TTDSG
    "ttdsg": "ttdsg",
    # DSA
    "eu_2022_2065": "dsa",
    "dsa": "dsa",
    # Data Act
    "eu_2023_2854": "data_act",
    "data_act": "data_act",
    # EU Machinery
    "eu_2023_1230": "eu_machinery",
    "eu_machinery": "eu_machinery",
    # DORA
    "eu_2022_2554": "dora",
    "dora": "dora",
}
|
||||
|
||||
|
||||
@dataclass
class ObligationMatch:
    """Result of obligation extraction.

    `method` records which tier produced the match; `confidence` is 1.0
    for exact matches, the cosine similarity for embedding matches, and
    a fixed heuristic value for LLM extraction.
    """

    obligation_id: Optional[str] = None
    obligation_title: Optional[str] = None
    obligation_text: Optional[str] = None
    method: str = "none"  # exact_match | embedding_match | llm_extracted | inferred
    confidence: float = 0.0
    regulation_id: Optional[str] = None  # e.g. "dsgvo"

    def to_dict(self) -> dict:
        """Serialize to a plain, JSON-friendly dict (fixed key order)."""
        field_names = (
            "obligation_id",
            "obligation_title",
            "obligation_text",
            "method",
            "confidence",
            "regulation_id",
        )
        return {name: getattr(self, name) for name in field_names}
|
||||
|
||||
|
||||
@dataclass
class _ObligationEntry:
    """Internal representation of a loaded obligation."""

    id: str              # obligation id from the v2 file, e.g. "DSGVO-OBL-001"
    title: str
    description: str
    regulation_id: str   # manifest regulation id, e.g. "dsgvo"
    articles: list[str] = field(default_factory=list)  # normalized: ["art. 30", "§ 38"]
    # Embedding of the obligation text; empty until computed during
    # initialization — entries with empty embeddings are skipped in Tier 2.
    embedding: list[float] = field(default_factory=list)
||||
|
||||
|
||||
class ObligationExtractor:
|
||||
"""3-Tier obligation extraction from RAG chunks.
|
||||
|
||||
Usage::
|
||||
|
||||
extractor = ObligationExtractor()
|
||||
await extractor.initialize() # loads obligations + embeddings
|
||||
|
||||
match = await extractor.extract(
|
||||
chunk_text="...",
|
||||
regulation_code="eu_2016_679",
|
||||
article="Art. 30",
|
||||
paragraph="Abs. 1",
|
||||
)
|
||||
"""
|
||||
|
||||
    def __init__(self):
        """Create an uninitialized extractor; call initialize() before extract()."""
        self._article_lookup: dict[str, list[str]] = {}  # "dsgvo/art. 30" → ["DSGVO-OBL-001"]
        self._obligations: dict[str, _ObligationEntry] = {}  # id → entry
        # The two lists below are index-aligned: _obligation_embeddings[i]
        # belongs to _obligation_ids[i].
        self._obligation_embeddings: list[list[float]] = []
        self._obligation_ids: list[str] = []
        self._initialized = False  # guards against repeated initialization
|
||||
|
||||
    async def initialize(self) -> None:
        """Load all obligations from v2 JSON files and compute embeddings.

        Idempotent: repeat calls are no-ops. Order matters — embeddings
        are computed over the obligations loaded in the first step.
        """
        if self._initialized:
            return

        self._load_obligations()
        await self._compute_embeddings()
        self._initialized = True
        logger.info(
            "ObligationExtractor initialized: %d obligations, %d article lookups, %d embeddings",
            len(self._obligations),
            len(self._article_lookup),
            # Count only non-empty embeddings (empty ones mean the
            # embedding call failed for that obligation).
            sum(1 for e in self._obligation_embeddings if e),
        )
|
||||
|
||||
async def extract(
|
||||
self,
|
||||
chunk_text: str,
|
||||
regulation_code: str,
|
||||
article: Optional[str] = None,
|
||||
paragraph: Optional[str] = None,
|
||||
) -> ObligationMatch:
|
||||
"""Extract obligation from a chunk using 3-tier strategy."""
|
||||
if not self._initialized:
|
||||
await self.initialize()
|
||||
|
||||
reg_id = _normalize_regulation(regulation_code)
|
||||
|
||||
# Tier 1: Exact match via article lookup
|
||||
if article:
|
||||
match = self._tier1_exact(reg_id, article)
|
||||
if match:
|
||||
return match
|
||||
|
||||
# Tier 2: Embedding similarity
|
||||
match = await self._tier2_embedding(chunk_text, reg_id)
|
||||
if match:
|
||||
return match
|
||||
|
||||
# Tier 3: LLM extraction
|
||||
match = await self._tier3_llm(chunk_text, regulation_code, article)
|
||||
return match
|
||||
|
||||
# -----------------------------------------------------------------------
|
||||
# Tier 1: Exact Match
|
||||
# -----------------------------------------------------------------------
|
||||
|
||||
def _tier1_exact(self, reg_id: Optional[str], article: str) -> Optional[ObligationMatch]:
|
||||
"""Look up obligation by regulation + article."""
|
||||
if not reg_id:
|
||||
return None
|
||||
|
||||
norm_article = _normalize_article(article)
|
||||
key = f"{reg_id}/{norm_article}"
|
||||
|
||||
obl_ids = self._article_lookup.get(key)
|
||||
if not obl_ids:
|
||||
return None
|
||||
|
||||
# Take the first match (highest priority)
|
||||
obl_id = obl_ids[0]
|
||||
entry = self._obligations.get(obl_id)
|
||||
if not entry:
|
||||
return None
|
||||
|
||||
return ObligationMatch(
|
||||
obligation_id=entry.id,
|
||||
obligation_title=entry.title,
|
||||
obligation_text=entry.description,
|
||||
method="exact_match",
|
||||
confidence=1.0,
|
||||
regulation_id=reg_id,
|
||||
)
|
||||
|
||||
# -----------------------------------------------------------------------
|
||||
# Tier 2: Embedding Match
|
||||
# -----------------------------------------------------------------------
|
||||
|
||||
    async def _tier2_embedding(
        self, chunk_text: str, reg_id: Optional[str]
    ) -> Optional[ObligationMatch]:
        """Find nearest obligation by embedding similarity.

        Selects the best candidate using cosine similarity plus a small
        same-regulation bonus, but accepts it only if the *raw* score
        (bonus removed) reaches EMBEDDING_MATCH_THRESHOLD. Returns None
        when no embeddings exist, the chunk embedding fails, or the best
        raw score is below the threshold (caller falls through to Tier 3).
        """
        if not self._obligation_embeddings:
            return None

        # Truncate to keep the embedding request bounded.
        chunk_embedding = await _get_embedding(chunk_text[:2000])
        if not chunk_embedding:
            return None

        best_idx = -1
        best_score = 0.0

        for i, obl_emb in enumerate(self._obligation_embeddings):
            if not obl_emb:
                # Embedding computation failed for this obligation — skip.
                continue
            # Prefer same-regulation matches
            obl_id = self._obligation_ids[i]
            entry = self._obligations.get(obl_id)
            score = _cosine_sim(chunk_embedding, obl_emb)

            # Domain bonus: +0.05 if same regulation
            if entry and reg_id and entry.regulation_id == reg_id:
                score += 0.05

            if score > best_score:
                best_score = score
                best_idx = i

        if best_idx < 0:
            return None

        # Remove domain bonus for threshold comparison — the bonus only
        # influences *which* candidate wins, not whether it is accepted.
        raw_score = best_score
        obl_id = self._obligation_ids[best_idx]
        entry = self._obligations.get(obl_id)
        if entry and reg_id and entry.regulation_id == reg_id:
            raw_score -= 0.05

        if raw_score >= EMBEDDING_MATCH_THRESHOLD:
            return ObligationMatch(
                obligation_id=entry.id if entry else obl_id,
                obligation_title=entry.title if entry else None,
                obligation_text=entry.description if entry else None,
                method="embedding_match",
                confidence=round(min(raw_score, 1.0), 3),
                regulation_id=entry.regulation_id if entry else reg_id,
            )

        return None
|
||||
|
||||
# -----------------------------------------------------------------------
|
||||
# Tier 3: LLM Extraction
|
||||
# -----------------------------------------------------------------------
|
||||
|
||||
    async def _tier3_llm(
        self, chunk_text: str, regulation_code: str, article: Optional[str]
    ) -> ObligationMatch:
        """Use local LLM to extract the obligation from the chunk.

        Always returns an ObligationMatch with method "llm_extracted":
        confidence 0.60 on success, 0.0 when the LLM call yields nothing.
        The extracted obligation has no obligation_id (it is not linked
        to a registry entry).
        """
        # Prompt is German by design — the corpus is German legal text.
        # Chunk is truncated to bound the request size.
        prompt = f"""Analysiere den folgenden Gesetzestext und extrahiere die zentrale rechtliche Pflicht.

Text:
{chunk_text[:3000]}

Quelle: {regulation_code} {article or ''}

Antworte NUR als JSON:
{{
  "obligation_text": "Die zentrale Pflicht in einem Satz",
  "actor": "Wer muss handeln (z.B. Verantwortlicher, Auftragsverarbeiter)",
  "action": "Was muss getan werden",
  "normative_strength": "muss|soll|kann"
}}"""

        system_prompt = (
            "Du bist ein Rechtsexperte fuer EU-Datenschutz- und Digitalrecht. "
            "Extrahiere die zentrale rechtliche Pflicht aus Gesetzestexten. "
            "Antworte ausschliesslich als JSON."
        )

        result_text = await _llm_ollama(prompt, system_prompt)
        if not result_text:
            # LLM unavailable or empty answer: return an empty match so the
            # caller still gets a well-formed result.
            return ObligationMatch(
                method="llm_extracted",
                confidence=0.0,
                regulation_id=_normalize_regulation(regulation_code),
            )

        parsed = _parse_json(result_text)
        # Fall back to the raw (truncated) LLM output if JSON parsing
        # did not yield the expected key.
        obligation_text = parsed.get("obligation_text", result_text[:500])

        return ObligationMatch(
            obligation_id=None,
            obligation_title=None,
            obligation_text=obligation_text,
            method="llm_extracted",
            confidence=0.60,
            regulation_id=_normalize_regulation(regulation_code),
        )
|
||||
|
||||
# -----------------------------------------------------------------------
|
||||
# Initialization helpers
|
||||
# -----------------------------------------------------------------------
|
||||
|
||||
    def _load_obligations(self) -> None:
        """Load all obligation files from the v2 framework.

        Populates ``self._obligations`` (obligation id → _ObligationEntry)
        and ``self._article_lookup`` ("<regulation_id>/<normalized article>"
        → [obligation ids]) used by Tier 1 article matching.

        Missing directory, manifest, or regulation files are logged and
        skipped — Tier 1 then simply has less (or no) data to match against.
        """
        v2_dir = _find_obligations_dir()
        if not v2_dir:
            logger.warning("Obligations v2 directory not found — Tier 1 disabled")
            return

        manifest_path = v2_dir / "_manifest.json"
        if not manifest_path.exists():
            logger.warning("Manifest not found at %s", manifest_path)
            return

        with open(manifest_path) as f:
            manifest = json.load(f)

        for reg_info in manifest.get("regulations", []):
            reg_id = reg_info["id"]
            reg_file = v2_dir / reg_info["file"]
            if not reg_file.exists():
                logger.warning("Regulation file not found: %s", reg_file)
                continue

            with open(reg_file) as f:
                data = json.load(f)

            for obl in data.get("obligations", []):
                obl_id = obl["id"]
                entry = _ObligationEntry(
                    id=obl_id,
                    title=obl.get("title", ""),
                    description=obl.get("description", ""),
                    regulation_id=reg_id,
                )

                # Build article lookup from legal_basis so Tier 1 can resolve
                # "<reg>/<article>" keys in O(1).
                for basis in obl.get("legal_basis", []):
                    article_raw = basis.get("article", "")
                    if article_raw:
                        norm_art = _normalize_article(article_raw)
                        key = f"{reg_id}/{norm_art}"
                        if key not in self._article_lookup:
                            self._article_lookup[key] = []
                        self._article_lookup[key].append(obl_id)
                        entry.articles.append(norm_art)

                self._obligations[obl_id] = entry

        logger.info(
            "Loaded %d obligations from %d regulations",
            len(self._obligations),
            len(manifest.get("regulations", [])),
        )
|
||||
|
||||
    async def _compute_embeddings(self) -> None:
        """Compute embeddings for all obligation descriptions.

        Fills ``self._obligation_ids`` and ``self._obligation_embeddings``
        as parallel lists; a failed embedding is stored as an empty list
        (see _get_embeddings_batch), which downstream code treats as
        "not embeddable".
        """
        if not self._obligations:
            return

        self._obligation_ids = list(self._obligations.keys())
        # Embed "title: description" so both fields contribute to similarity.
        texts = [
            f"{self._obligations[oid].title}: {self._obligations[oid].description}"
            for oid in self._obligation_ids
        ]

        logger.info("Computing embeddings for %d obligations...", len(texts))
        self._obligation_embeddings = await _get_embeddings_batch(texts)
        valid = sum(1 for e in self._obligation_embeddings if e)
        logger.info("Got %d/%d valid embeddings", valid, len(texts))
|
||||
|
||||
# -----------------------------------------------------------------------
|
||||
# Stats
|
||||
# -----------------------------------------------------------------------
|
||||
|
||||
    def stats(self) -> dict:
        """Return initialization statistics.

        Keys: total_obligations, article_lookups, embeddings_valid (count of
        non-empty embedding vectors), regulations (distinct regulation IDs,
        unordered), initialized.
        """
        return {
            "total_obligations": len(self._obligations),
            "article_lookups": len(self._article_lookup),
            "embeddings_valid": sum(1 for e in self._obligation_embeddings if e),
            # Set comprehension deduplicates; resulting list order is arbitrary.
            "regulations": list(
                {e.regulation_id for e in self._obligations.values()}
            ),
            "initialized": self._initialized,
        }
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Module-level helpers (reusable by other modules)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _normalize_regulation(regulation_code: str) -> Optional[str]:
|
||||
"""Map a RAG regulation_code to obligation framework regulation ID."""
|
||||
if not regulation_code:
|
||||
return None
|
||||
code = regulation_code.lower().strip()
|
||||
|
||||
# Direct lookup
|
||||
if code in _REGULATION_CODE_TO_ID:
|
||||
return _REGULATION_CODE_TO_ID[code]
|
||||
|
||||
# Prefix matching for families
|
||||
for prefix, reg_id in [
|
||||
("eu_2016_679", "dsgvo"),
|
||||
("eu_2024_1689", "ai_act"),
|
||||
("eu_2022_2555", "nis2"),
|
||||
("eu_2022_2065", "dsa"),
|
||||
("eu_2023_2854", "data_act"),
|
||||
("eu_2023_1230", "eu_machinery"),
|
||||
("eu_2022_2554", "dora"),
|
||||
]:
|
||||
if code.startswith(prefix):
|
||||
return reg_id
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def _normalize_article(article: str) -> str:
|
||||
"""Normalize article references for consistent lookup.
|
||||
|
||||
Examples:
|
||||
"Art. 30" → "art. 30"
|
||||
"§ 38 BDSG" → "§ 38"
|
||||
"Article 10" → "art. 10"
|
||||
"Art. 30 Abs. 1" → "art. 30"
|
||||
"Artikel 35" → "art. 35"
|
||||
"""
|
||||
if not article:
|
||||
return ""
|
||||
s = article.strip()
|
||||
|
||||
# Remove trailing law name: "§ 38 BDSG" → "§ 38"
|
||||
s = re.sub(r"\s+(DSGVO|BDSG|TTDSG|DSA|NIS2|DORA|AI.?Act)\s*$", "", s, flags=re.IGNORECASE)
|
||||
|
||||
# Remove paragraph references: "Art. 30 Abs. 1" → "Art. 30"
|
||||
s = re.sub(r"\s+(Abs|Absatz|para|paragraph|lit|Satz)\.?\s+.*$", "", s, flags=re.IGNORECASE)
|
||||
|
||||
# Normalize "Article" / "Artikel" → "Art."
|
||||
s = re.sub(r"^(Article|Artikel)\s+", "Art. ", s, flags=re.IGNORECASE)
|
||||
|
||||
return s.lower().strip()
|
||||
|
||||
|
||||
def _cosine_sim(a: list[float], b: list[float]) -> float:
|
||||
"""Compute cosine similarity between two vectors."""
|
||||
if not a or not b or len(a) != len(b):
|
||||
return 0.0
|
||||
dot = sum(x * y for x, y in zip(a, b))
|
||||
norm_a = sum(x * x for x in a) ** 0.5
|
||||
norm_b = sum(x * x for x in b) ** 0.5
|
||||
if norm_a == 0 or norm_b == 0:
|
||||
return 0.0
|
||||
return dot / (norm_a * norm_b)
|
||||
|
||||
|
||||
def _find_obligations_dir() -> Optional[Path]:
    """Locate the obligations v2 directory.

    Probes known install layouts (repo checkout, /app container path,
    relative path) and returns the first directory that contains a
    _manifest.json, or None when nothing matches.
    """
    repo_root = Path(__file__).resolve().parent.parent.parent.parent
    candidates = (
        repo_root / "ai-compliance-sdk" / "policies" / "obligations" / "v2",
        Path("/app/ai-compliance-sdk/policies/obligations/v2"),
        Path("ai-compliance-sdk/policies/obligations/v2"),
    )
    return next(
        (c for c in candidates if c.is_dir() and (c / "_manifest.json").exists()),
        None,
    )
|
||||
|
||||
|
||||
async def _get_embedding(text: str) -> list[float]:
    """Get the embedding vector for a single text.

    Calls the embedding service at EMBEDDING_URL; returns [] on any failure
    (timeout, HTTP error, empty response) so callers can treat a missing
    embedding as "skip this item" rather than crash.
    """
    try:
        async with httpx.AsyncClient(timeout=10.0) as client:
            resp = await client.post(
                f"{EMBEDDING_URL}/embed",
                json={"texts": [text]},
            )
            resp.raise_for_status()
            embeddings = resp.json().get("embeddings", [])
            return embeddings[0] if embeddings else []
    except Exception:
        # Best-effort by design: embedding loss degrades matching quality,
        # it must not abort the pipeline.
        return []
|
||||
|
||||
|
||||
async def _get_embeddings_batch(
    texts: list[str], batch_size: int = 32
) -> list[list[float]]:
    """Get embeddings for multiple texts in batches.

    The result is positionally aligned with ``texts``; when a batch request
    fails, each of its positions is filled with an empty list so alignment
    with the input is preserved.
    """
    all_embeddings: list[list[float]] = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i : i + batch_size]
        try:
            async with httpx.AsyncClient(timeout=30.0) as client:
                resp = await client.post(
                    f"{EMBEDDING_URL}/embed",
                    json={"texts": batch},
                )
                resp.raise_for_status()
                embeddings = resp.json().get("embeddings", [])
                all_embeddings.extend(embeddings)
        except Exception as e:
            logger.warning("Batch embedding failed for %d texts: %s", len(batch), e)
            # Placeholder empty vectors keep the output aligned with the input.
            all_embeddings.extend([[] for _ in batch])
    return all_embeddings
|
||||
|
||||
|
||||
async def _llm_ollama(prompt: str, system_prompt: Optional[str] = None) -> str:
    """Call local Ollama for LLM extraction.

    Sends a non-streaming chat request to OLLAMA_URL and returns the
    assistant message content, or "" on any failure (non-200 response,
    timeout, connection error) — callers treat "" as "no LLM result".
    """
    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    messages.append({"role": "user", "content": prompt})

    payload = {
        "model": OLLAMA_MODEL,
        "messages": messages,
        "stream": False,
        # Ask Ollama to constrain output to valid JSON.
        "format": "json",
        "options": {"num_predict": 512},
        # NOTE(review): presumably disables the model's "thinking" mode
        # (Ollama-specific option) — confirm against the Ollama API docs.
        "think": False,
    }

    try:
        async with httpx.AsyncClient(timeout=LLM_TIMEOUT) as client:
            resp = await client.post(f"{OLLAMA_URL}/api/chat", json=payload)
            if resp.status_code != 200:
                logger.error(
                    "Ollama chat failed %d: %s", resp.status_code, resp.text[:300]
                )
                return ""
            data = resp.json()
            return data.get("message", {}).get("content", "")
    except Exception as e:
        logger.warning("Ollama call failed: %s", e)
        return ""
|
||||
|
||||
|
||||
def _parse_json(text: str) -> dict:
|
||||
"""Extract JSON from LLM response text."""
|
||||
# Try direct parse
|
||||
try:
|
||||
return json.loads(text)
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
# Try extracting JSON block
|
||||
match = re.search(r"\{[^{}]*\}", text, re.DOTALL)
|
||||
if match:
|
||||
try:
|
||||
return json.loads(match.group())
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
return {}
|
||||
# ==== New file in this commit: backend-compliance/compliance/services/pattern_matcher.py (+532 lines) ====
|
||||
"""Pattern Matcher — Obligation-to-Control-Pattern Linking.
|
||||
|
||||
Maps obligations (from the ObligationExtractor) to control patterns
|
||||
using two tiers:
|
||||
|
||||
Tier 1: KEYWORD MATCH — obligation_match_keywords from patterns (~70%)
|
||||
Tier 2: EMBEDDING — cosine similarity with domain bonus (~25%)
|
||||
|
||||
Part of the Multi-Layer Control Architecture (Phase 5 of 8).
|
||||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import yaml
|
||||
|
||||
from compliance.services.obligation_extractor import (
|
||||
_cosine_sim,
|
||||
_get_embedding,
|
||||
_get_embeddings_batch,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Minimum keyword hits to accept a Tier 1 match (at least 2 keyword hits;
# a single hit is too noisy to link an obligation to a pattern).
KEYWORD_MATCH_MIN_HITS = 2
# Cosine-similarity threshold a pattern must reach for a Tier 2 match.
EMBEDDING_PATTERN_THRESHOLD = 0.75
# Confidence bonus added when the source regulation has affinity with the
# pattern's domain (see _REGULATION_DOMAIN_AFFINITY below).
DOMAIN_BONUS = 0.10

# Map regulation IDs to the pattern domains that are likely relevant for them.
# Used by PatternMatcher._domain_matches to decide whether DOMAIN_BONUS applies.
_REGULATION_DOMAIN_AFFINITY = {
    "dsgvo": ["DATA", "COMP", "GOV"],
    "bdsg": ["DATA", "COMP"],
    "ttdsg": ["DATA"],
    "ai_act": ["AI", "COMP", "DATA"],
    "nis2": ["SEC", "INC", "NET", "LOG", "CRYP"],
    "dsa": ["DATA", "COMP"],
    "data_act": ["DATA", "COMP"],
    "eu_machinery": ["SEC", "COMP"],
    "dora": ["SEC", "INC", "FIN", "COMP"],
}
|
||||
|
||||
|
||||
@dataclass
class ControlPattern:
    """Python representation of a control pattern from YAML."""

    # Identity / classification
    id: str
    name: str
    name_de: str
    domain: str
    category: str
    description: str
    # Templates from which concrete controls are composed
    objective_template: str
    rationale_template: str
    requirements_template: list[str] = field(default_factory=list)
    test_procedure_template: list[str] = field(default_factory=list)
    evidence_template: list[str] = field(default_factory=list)
    # Defaults applied when a composed control doesn't override them
    severity_default: str = "medium"
    implementation_effort_default: str = "m"
    # Matching / composition metadata
    obligation_match_keywords: list[str] = field(default_factory=list)  # Tier 1 keywords
    tags: list[str] = field(default_factory=list)
    composable_with: list[str] = field(default_factory=list)  # IDs of compatible patterns
    open_anchor_refs: list[dict] = field(default_factory=list)
|
||||
|
||||
@dataclass
class PatternMatchResult:
    """Result of pattern matching.

    A default-constructed instance (pattern=None, confidence=0.0,
    method="none") represents "no match found".
    """

    pattern: Optional[ControlPattern] = None
    pattern_id: Optional[str] = None
    method: str = "none"  # keyword | embedding | combined | none
    confidence: float = 0.0
    # Tier 1 (keyword) details
    keyword_hits: int = 0
    total_keywords: int = 0
    # Tier 2 (embedding) details
    embedding_score: float = 0.0
    domain_bonus_applied: bool = False
    # IDs of other loaded patterns this match can be composed with
    composable_patterns: list[str] = field(default_factory=list)

    def to_dict(self) -> dict:
        """Serialize the result (without the full pattern object) for API/logging."""
        return {
            "pattern_id": self.pattern_id,
            "method": self.method,
            "confidence": round(self.confidence, 3),
            "keyword_hits": self.keyword_hits,
            "total_keywords": self.total_keywords,
            "embedding_score": round(self.embedding_score, 3),
            "domain_bonus_applied": self.domain_bonus_applied,
            "composable_patterns": self.composable_patterns,
        }
|
||||
|
||||
|
||||
class PatternMatcher:
    """Links obligations to control patterns using keyword + embedding matching.

    Usage::

        matcher = PatternMatcher()
        await matcher.initialize()

        result = await matcher.match(
            obligation_text="Fuehrung eines Verarbeitungsverzeichnisses...",
            regulation_id="dsgvo",
        )
        print(result.pattern_id)  # e.g. "CP-COMP-001"
        print(result.confidence)  # e.g. 0.85
    """

    def __init__(self):
        # Loaded patterns plus the lookup structures built in initialize().
        self._patterns: list[ControlPattern] = []
        self._by_id: dict[str, ControlPattern] = {}
        self._by_domain: dict[str, list[ControlPattern]] = {}
        self._keyword_index: dict[str, list[str]] = {}  # keyword → [pattern_ids]
        # Parallel lists: _pattern_embeddings[i] belongs to _pattern_ids[i].
        self._pattern_embeddings: list[list[float]] = []
        self._pattern_ids: list[str] = []
        self._initialized = False

    async def initialize(self) -> None:
        """Load patterns from YAML and compute embeddings (idempotent)."""
        if self._initialized:
            return

        self._load_patterns()
        self._build_keyword_index()
        await self._compute_embeddings()
        self._initialized = True
        logger.info(
            "PatternMatcher initialized: %d patterns, %d keywords, %d embeddings",
            len(self._patterns),
            len(self._keyword_index),
            sum(1 for e in self._pattern_embeddings if e),
        )

    async def match(
        self,
        obligation_text: str,
        regulation_id: Optional[str] = None,
        top_n: int = 1,
    ) -> PatternMatchResult:
        """Match obligation text to the best control pattern.

        Runs both tiers and combines their results (see _combine_results).

        Args:
            obligation_text: The obligation description to match against.
            regulation_id: Source regulation (for domain bonus).
            top_n: Number of top results to consider for composability.
                NOTE(review): currently unused in this method — see match_top_n.

        Returns:
            PatternMatchResult with the best match (empty result when no
            text is given or no patterns are loaded).
        """
        if not self._initialized:
            await self.initialize()

        if not obligation_text or not self._patterns:
            return PatternMatchResult()

        # Tier 1: Keyword matching
        keyword_result = self._tier1_keyword(obligation_text, regulation_id)

        # Tier 2: Embedding matching
        embedding_result = await self._tier2_embedding(obligation_text, regulation_id)

        # Combine scores: prefer keyword match, boost with embedding if available
        best = self._combine_results(keyword_result, embedding_result)

        # Attach composable patterns, keeping only IDs that are actually loaded.
        if best.pattern:
            best.composable_patterns = [
                pid for pid in best.pattern.composable_with
                if pid in self._by_id
            ]

        return best

    async def match_top_n(
        self,
        obligation_text: str,
        regulation_id: Optional[str] = None,
        n: int = 3,
    ) -> list[PatternMatchResult]:
        """Return top-N pattern matches sorted by confidence descending."""
        if not self._initialized:
            await self.initialize()

        if not obligation_text or not self._patterns:
            return []

        keyword_scores = self._keyword_scores(obligation_text, regulation_id)
        embedding_scores = await self._embedding_scores(obligation_text, regulation_id)

        # Merge: consider every pattern either tier scored.
        all_pattern_ids = set(keyword_scores.keys()) | set(embedding_scores.keys())
        results: list[PatternMatchResult] = []

        for pid in all_pattern_ids:
            pattern = self._by_id.get(pid)
            if not pattern:
                continue

            kw_score = keyword_scores.get(pid, (0, 0, 0.0))  # (hits, total, score)
            emb_score = embedding_scores.get(pid, (0.0, False))  # (score, bonus_applied)

            kw_hits, kw_total, kw_confidence = kw_score
            emb_confidence, bonus_applied = emb_score

            # Combined confidence: max of keyword and embedding, with a small
            # agreement boost when both tiers scored the pattern.
            if kw_confidence > 0 and emb_confidence > 0:
                combined = max(kw_confidence, emb_confidence) + 0.05
                method = "combined"
            elif kw_confidence > 0:
                combined = kw_confidence
                method = "keyword"
            else:
                combined = emb_confidence
                method = "embedding"

            results.append(PatternMatchResult(
                pattern=pattern,
                pattern_id=pid,
                method=method,
                confidence=min(combined, 1.0),
                keyword_hits=kw_hits,
                total_keywords=kw_total,
                embedding_score=emb_confidence,
                domain_bonus_applied=bonus_applied,
                composable_patterns=[
                    p for p in pattern.composable_with if p in self._by_id
                ],
            ))

        # Sort by confidence descending
        results.sort(key=lambda r: r.confidence, reverse=True)
        return results[:n]

    # -----------------------------------------------------------------------
    # Tier 1: Keyword Match
    # -----------------------------------------------------------------------

    def _tier1_keyword(
        self, obligation_text: str, regulation_id: Optional[str]
    ) -> Optional[PatternMatchResult]:
        """Match by counting keyword hits in the obligation text.

        Returns None when no pattern scores, or when the best-scoring
        pattern has fewer than KEYWORD_MATCH_MIN_HITS hits.
        """
        scores = self._keyword_scores(obligation_text, regulation_id)
        if not scores:
            return None

        # Find best match by confidence (= hit ratio).
        best_pid = max(scores, key=lambda pid: scores[pid][2])
        hits, total, confidence = scores[best_pid]

        # NOTE(review): the hit-count floor is applied only to the single
        # best-by-confidence pattern; another pattern with >= MIN hits but a
        # lower ratio is not considered. Confirm this is intended.
        if hits < KEYWORD_MATCH_MIN_HITS:
            return None

        pattern = self._by_id.get(best_pid)
        if not pattern:
            return None

        # Check domain bonus
        bonus_applied = False
        if regulation_id and self._domain_matches(pattern.domain, regulation_id):
            confidence = min(confidence + DOMAIN_BONUS, 1.0)
            bonus_applied = True

        return PatternMatchResult(
            pattern=pattern,
            pattern_id=best_pid,
            method="keyword",
            confidence=confidence,
            keyword_hits=hits,
            total_keywords=total,
            domain_bonus_applied=bonus_applied,
        )

    def _keyword_scores(
        self, text: str, regulation_id: Optional[str]
    ) -> dict[str, tuple[int, int, float]]:
        """Compute keyword match scores for all patterns.

        Confidence is the fraction of a pattern's keywords found in the
        text (substring match, case-insensitive).

        Returns dict: pattern_id → (hits, total_keywords, confidence).
        """
        text_lower = text.lower()
        hits_by_pattern: dict[str, int] = {}

        for keyword, pattern_ids in self._keyword_index.items():
            if keyword in text_lower:
                for pid in pattern_ids:
                    hits_by_pattern[pid] = hits_by_pattern.get(pid, 0) + 1

        result: dict[str, tuple[int, int, float]] = {}
        for pid, hits in hits_by_pattern.items():
            pattern = self._by_id.get(pid)
            if not pattern:
                continue
            total = len(pattern.obligation_match_keywords)
            confidence = hits / total if total > 0 else 0.0
            result[pid] = (hits, total, confidence)

        return result

    # -----------------------------------------------------------------------
    # Tier 2: Embedding Match
    # -----------------------------------------------------------------------

    async def _tier2_embedding(
        self, obligation_text: str, regulation_id: Optional[str]
    ) -> Optional[PatternMatchResult]:
        """Match by embedding similarity against pattern objective_templates.

        Returns None when no score reaches EMBEDDING_PATTERN_THRESHOLD.
        """
        scores = await self._embedding_scores(obligation_text, regulation_id)
        if not scores:
            return None

        best_pid = max(scores, key=lambda pid: scores[pid][0])
        emb_score, bonus_applied = scores[best_pid]

        if emb_score < EMBEDDING_PATTERN_THRESHOLD:
            return None

        pattern = self._by_id.get(best_pid)
        if not pattern:
            return None

        return PatternMatchResult(
            pattern=pattern,
            pattern_id=best_pid,
            method="embedding",
            # Cap at 1.0 — the domain bonus can push the raw score above 1.
            confidence=min(emb_score, 1.0),
            embedding_score=emb_score,
            domain_bonus_applied=bonus_applied,
        )

    async def _embedding_scores(
        self, obligation_text: str, regulation_id: Optional[str]
    ) -> dict[str, tuple[float, bool]]:
        """Compute embedding similarity scores for all patterns.

        Only the first 2000 characters of the obligation text are embedded.
        Patterns without a valid embedding are skipped. Scores include the
        domain bonus (uncapped here).

        Returns dict: pattern_id → (score, domain_bonus_applied).
        """
        if not self._pattern_embeddings:
            return {}

        chunk_embedding = await _get_embedding(obligation_text[:2000])
        if not chunk_embedding:
            return {}

        result: dict[str, tuple[float, bool]] = {}
        for i, pat_emb in enumerate(self._pattern_embeddings):
            if not pat_emb:
                continue
            pid = self._pattern_ids[i]
            pattern = self._by_id.get(pid)
            if not pattern:
                continue

            score = _cosine_sim(chunk_embedding, pat_emb)

            # Domain bonus
            bonus_applied = False
            if regulation_id and self._domain_matches(pattern.domain, regulation_id):
                score += DOMAIN_BONUS
                bonus_applied = True

            result[pid] = (score, bonus_applied)

        return result

    # -----------------------------------------------------------------------
    # Score combination
    # -----------------------------------------------------------------------

    def _combine_results(
        self,
        keyword_result: Optional[PatternMatchResult],
        embedding_result: Optional[PatternMatchResult],
    ) -> PatternMatchResult:
        """Combine keyword and embedding results into the best match.

        Agreement on the same pattern earns a +0.05 confidence boost;
        disagreement is resolved by the higher confidence.
        """
        if not keyword_result and not embedding_result:
            return PatternMatchResult()

        if not keyword_result:
            return embedding_result
        if not embedding_result:
            return keyword_result

        # Both matched — check if they agree
        if keyword_result.pattern_id == embedding_result.pattern_id:
            # Same pattern: boost confidence
            combined_confidence = min(
                max(keyword_result.confidence, embedding_result.confidence) + 0.05,
                1.0,
            )
            return PatternMatchResult(
                pattern=keyword_result.pattern,
                pattern_id=keyword_result.pattern_id,
                method="combined",
                confidence=combined_confidence,
                keyword_hits=keyword_result.keyword_hits,
                total_keywords=keyword_result.total_keywords,
                embedding_score=embedding_result.embedding_score,
                domain_bonus_applied=(
                    keyword_result.domain_bonus_applied
                    or embedding_result.domain_bonus_applied
                ),
            )

        # Different patterns: pick the one with higher confidence
        if keyword_result.confidence >= embedding_result.confidence:
            return keyword_result
        return embedding_result

    # -----------------------------------------------------------------------
    # Domain affinity
    # -----------------------------------------------------------------------

    @staticmethod
    def _domain_matches(pattern_domain: str, regulation_id: str) -> bool:
        """Check if a pattern's domain has affinity with a regulation."""
        affine_domains = _REGULATION_DOMAIN_AFFINITY.get(regulation_id, [])
        return pattern_domain in affine_domains

    # -----------------------------------------------------------------------
    # Initialization helpers
    # -----------------------------------------------------------------------

    def _load_patterns(self) -> None:
        """Load control patterns from YAML files.

        Files starting with "_" are skipped; a malformed file is logged and
        skipped without aborting the load.
        """
        patterns_dir = _find_patterns_dir()
        if not patterns_dir:
            logger.warning("Control patterns directory not found")
            return

        for yaml_file in sorted(patterns_dir.glob("*.yaml")):
            if yaml_file.name.startswith("_"):
                continue
            try:
                with open(yaml_file) as f:
                    data = yaml.safe_load(f)
                if not data or "patterns" not in data:
                    continue
                for p in data["patterns"]:
                    pattern = ControlPattern(
                        id=p["id"],
                        name=p["name"],
                        name_de=p["name_de"],
                        domain=p["domain"],
                        category=p["category"],
                        description=p["description"],
                        objective_template=p["objective_template"],
                        rationale_template=p["rationale_template"],
                        requirements_template=p.get("requirements_template", []),
                        test_procedure_template=p.get("test_procedure_template", []),
                        evidence_template=p.get("evidence_template", []),
                        severity_default=p.get("severity_default", "medium"),
                        implementation_effort_default=p.get("implementation_effort_default", "m"),
                        obligation_match_keywords=p.get("obligation_match_keywords", []),
                        tags=p.get("tags", []),
                        composable_with=p.get("composable_with", []),
                        open_anchor_refs=p.get("open_anchor_refs", []),
                    )
                    self._patterns.append(pattern)
                    self._by_id[pattern.id] = pattern
                    domain_list = self._by_domain.setdefault(pattern.domain, [])
                    domain_list.append(pattern)
            except Exception as e:
                logger.error("Failed to load %s: %s", yaml_file.name, e)

        logger.info("Loaded %d patterns from %s", len(self._patterns), patterns_dir)

    def _build_keyword_index(self) -> None:
        """Build reverse index: keyword (lowercased) → [pattern_ids]."""
        for pattern in self._patterns:
            for kw in pattern.obligation_match_keywords:
                lower_kw = kw.lower()
                if lower_kw not in self._keyword_index:
                    self._keyword_index[lower_kw] = []
                self._keyword_index[lower_kw].append(pattern.id)

    async def _compute_embeddings(self) -> None:
        """Compute embeddings for all pattern objective templates.

        Fills _pattern_ids and _pattern_embeddings as parallel lists; a
        failed embedding is an empty list (see _get_embeddings_batch).
        """
        if not self._patterns:
            return

        self._pattern_ids = [p.id for p in self._patterns]
        # Embed "german name: objective" so both contribute to similarity.
        texts = [
            f"{p.name_de}: {p.objective_template}"
            for p in self._patterns
        ]

        logger.info("Computing embeddings for %d patterns...", len(texts))
        self._pattern_embeddings = await _get_embeddings_batch(texts)
        valid = sum(1 for e in self._pattern_embeddings if e)
        logger.info("Got %d/%d valid pattern embeddings", valid, len(texts))

    # -----------------------------------------------------------------------
    # Public helpers
    # -----------------------------------------------------------------------

    def get_pattern(self, pattern_id: str) -> Optional[ControlPattern]:
        """Get a pattern by its ID (case-insensitive: IDs are stored uppercase)."""
        return self._by_id.get(pattern_id.upper())

    def get_patterns_by_domain(self, domain: str) -> list[ControlPattern]:
        """Get all patterns for a domain (case-insensitive)."""
        return self._by_domain.get(domain.upper(), [])

    def stats(self) -> dict:
        """Return matcher statistics for diagnostics/monitoring."""
        return {
            "total_patterns": len(self._patterns),
            "domains": list(self._by_domain.keys()),
            "keywords": len(self._keyword_index),
            "embeddings_valid": sum(1 for e in self._pattern_embeddings if e),
            "initialized": self._initialized,
        }
|
||||
|
||||
|
||||
def _find_patterns_dir() -> Optional[Path]:
    """Locate the control_patterns directory.

    Probes known install layouts (repo checkout, /app container path,
    relative path) and returns the first existing directory, or None.
    """
    repo_root = Path(__file__).resolve().parent.parent.parent.parent
    candidates = (
        repo_root / "ai-compliance-sdk" / "policies" / "control_patterns",
        Path("/app/ai-compliance-sdk/policies/control_patterns"),
        Path("ai-compliance-sdk/policies/control_patterns"),
    )
    return next((c for c in candidates if c.is_dir()), None)
|
||||
# ==== New file in this commit: backend-compliance/compliance/services/pipeline_adapter.py (+670 lines) ====
|
||||
"""Pipeline Adapter — New 10-Stage Pipeline Integration.
|
||||
|
||||
Bridges the existing 7-stage control_generator pipeline with the new
|
||||
multi-layer components (ObligationExtractor, PatternMatcher, ControlComposer).
|
||||
|
||||
New pipeline flow:
|
||||
chunk → license_classify
|
||||
→ obligation_extract (Stage 4 — NEW)
|
||||
→ pattern_match (Stage 5 — NEW)
|
||||
→ control_compose (Stage 6 — replaces old Stage 3)
|
||||
→ harmonize → anchor → store + crosswalk → mark processed
|
||||
|
||||
Can be used in two modes:
|
||||
1. INLINE: Called from _process_batch() to enrich the pipeline
|
||||
2. STANDALONE: Process chunks directly through new stages
|
||||
|
||||
Part of the Multi-Layer Control Architecture (Phase 7 of 8).
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
import logging
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Optional
|
||||
|
||||
from sqlalchemy import text
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from compliance.services.control_composer import ComposedControl, ControlComposer
|
||||
from compliance.services.obligation_extractor import ObligationExtractor, ObligationMatch
|
||||
from compliance.services.pattern_matcher import PatternMatcher, PatternMatchResult
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class PipelineChunk:
    """A single text chunk fed into the new pipeline stages.

    Carries the raw text plus provenance metadata (regulation, article,
    licensing) and a lazily computed SHA-256 content hash.
    """

    text: str
    collection: str = ""
    regulation_code: str = ""
    article: Optional[str] = None
    paragraph: Optional[str] = None
    license_rule: int = 3
    license_info: dict = field(default_factory=dict)
    source_citation: Optional[dict] = None
    chunk_hash: str = ""  # filled on first compute_hash() call

    def compute_hash(self) -> str:
        """Return the SHA-256 hex digest of the chunk text, caching it on the instance."""
        if self.chunk_hash:
            return self.chunk_hash
        digest = hashlib.sha256(self.text.encode()).hexdigest()
        self.chunk_hash = digest
        return digest
|
||||
|
||||
|
||||
@dataclass
class PipelineResult:
    """Result of processing a chunk through the new pipeline.

    Collects the outputs of stages 4–6 (obligation extraction, pattern
    matching, control composition); ``error`` is set instead of raising
    when a stage fails.
    """

    chunk: PipelineChunk
    obligation: ObligationMatch = field(default_factory=ObligationMatch)
    pattern_result: PatternMatchResult = field(default_factory=PatternMatchResult)
    control: Optional[ComposedControl] = None
    crosswalk_written: bool = False
    error: Optional[str] = None

    def to_dict(self) -> dict:
        """Serialize the result for API responses / logging."""
        return {
            "chunk_hash": self.chunk.chunk_hash,
            "obligation": self.obligation.to_dict() if self.obligation else None,
            "pattern": self.pattern_result.to_dict() if self.pattern_result else None,
            "control": self.control.to_dict() if self.control else None,
            "crosswalk_written": self.crosswalk_written,
            "error": self.error,
        }
|
||||
|
||||
|
||||
class PipelineAdapter:
    """Integrates ObligationExtractor + PatternMatcher + ControlComposer.

    Usage::

        adapter = PipelineAdapter(db)
        await adapter.initialize()

        result = await adapter.process_chunk(PipelineChunk(
            text="...",
            regulation_code="eu_2016_679",
            article="Art. 30",
            license_rule=1,
        ))
    """

    def __init__(self, db: Optional[Session] = None):
        # db is optional: chunk processing works without it, but
        # write_crosswalk() becomes a no-op (returns False) when absent.
        self.db = db
        self._extractor = ObligationExtractor()
        self._matcher = PatternMatcher()
        self._composer = ControlComposer()
        self._initialized = False

    async def initialize(self) -> None:
        """Initialize all sub-components."""
        # Idempotent — safe to call repeatedly.
        if self._initialized:
            return
        await self._extractor.initialize()
        await self._matcher.initialize()
        self._initialized = True
        logger.info("PipelineAdapter initialized")

    async def process_chunk(self, chunk: PipelineChunk) -> PipelineResult:
        """Process a single chunk through the new 3-stage pipeline.

        Stage 4: Obligation Extract
        Stage 5: Pattern Match
        Stage 6: Control Compose

        Never raises: any failure is recorded on ``result.error`` so batch
        callers can keep going.
        """
        # Lazy init lets callers skip an explicit initialize().
        if not self._initialized:
            await self.initialize()

        chunk.compute_hash()
        result = PipelineResult(chunk=chunk)

        try:
            # Stage 4: Obligation Extract
            result.obligation = await self._extractor.extract(
                chunk_text=chunk.text,
                regulation_code=chunk.regulation_code,
                article=chunk.article,
                paragraph=chunk.paragraph,
            )

            # Stage 5: Pattern Match
            # Prefer extracted obligation text, then the title, then the
            # first 500 chars of the raw chunk as a last resort.
            obligation_text = (
                result.obligation.obligation_text
                or result.obligation.obligation_title
                or chunk.text[:500]
            )
            result.pattern_result = await self._matcher.match(
                obligation_text=obligation_text,
                regulation_id=result.obligation.regulation_id,
            )

            # Stage 6: Control Compose
            # Raw chunk text is only forwarded for license rules 1 and 2 —
            # presumably the rules that allow reproducing source text;
            # TODO confirm against the license-rule definitions.
            result.control = await self._composer.compose(
                obligation=result.obligation,
                pattern_result=result.pattern_result,
                chunk_text=chunk.text if chunk.license_rule in (1, 2) else None,
                license_rule=chunk.license_rule,
                source_citation=chunk.source_citation,
                regulation_code=chunk.regulation_code,
            )

        except Exception as e:
            # Capture instead of raise so one bad chunk cannot abort a batch.
            logger.error("Pipeline processing failed: %s", e)
            result.error = str(e)

        return result

    async def process_batch(self, chunks: list[PipelineChunk]) -> list[PipelineResult]:
        """Process multiple chunks through the pipeline."""
        # Sequential by design; per-chunk errors are captured in each result.
        results = []
        for chunk in chunks:
            result = await self.process_chunk(chunk)
            results.append(result)
        return results

    def write_crosswalk(self, result: PipelineResult, control_uuid: str) -> bool:
        """Write obligation_extraction + crosswalk_matrix rows for a processed chunk.

        Called AFTER the control is stored in canonical_controls.

        Returns True on success; False when there is no DB session, no
        composed control, or any SQL step failed (the transaction is rolled
        back in that case).
        """
        if not self.db or not result.control:
            return False

        chunk = result.chunk
        obligation = result.obligation
        pattern = result.pattern_result

        try:
            # 1. Write obligation_extraction row
            self.db.execute(
                text("""
                    INSERT INTO obligation_extractions (
                        chunk_hash, collection, regulation_code,
                        article, paragraph, obligation_id,
                        obligation_text, confidence, extraction_method,
                        pattern_id, pattern_match_score, control_uuid
                    ) VALUES (
                        :chunk_hash, :collection, :regulation_code,
                        :article, :paragraph, :obligation_id,
                        :obligation_text, :confidence, :extraction_method,
                        :pattern_id, :pattern_match_score,
                        CAST(:control_uuid AS uuid)
                    )
                """),
                {
                    "chunk_hash": chunk.chunk_hash,
                    "collection": chunk.collection,
                    "regulation_code": chunk.regulation_code,
                    "article": chunk.article,
                    "paragraph": chunk.paragraph,
                    "obligation_id": obligation.obligation_id if obligation else None,
                    # Truncate to the column's presumed 2000-char budget —
                    # TODO confirm against the obligation_extractions schema.
                    "obligation_text": (
                        obligation.obligation_text[:2000]
                        if obligation and obligation.obligation_text
                        else None
                    ),
                    "confidence": obligation.confidence if obligation else 0,
                    "extraction_method": obligation.method if obligation else "none",
                    "pattern_id": pattern.pattern_id if pattern else None,
                    "pattern_match_score": pattern.confidence if pattern else 0,
                    "control_uuid": control_uuid,
                },
            )

            # 2. Write crosswalk_matrix row
            self.db.execute(
                text("""
                    INSERT INTO crosswalk_matrix (
                        regulation_code, article, paragraph,
                        obligation_id, pattern_id,
                        master_control_id, master_control_uuid,
                        confidence, source
                    ) VALUES (
                        :regulation_code, :article, :paragraph,
                        :obligation_id, :pattern_id,
                        :master_control_id,
                        CAST(:master_control_uuid AS uuid),
                        :confidence, :source
                    )
                """),
                {
                    "regulation_code": chunk.regulation_code,
                    "article": chunk.article,
                    "paragraph": chunk.paragraph,
                    "obligation_id": obligation.obligation_id if obligation else None,
                    "pattern_id": pattern.pattern_id if pattern else None,
                    "master_control_id": result.control.control_id,
                    "master_control_uuid": control_uuid,
                    # Conservative: the crosswalk confidence is the weaker of
                    # the extraction and pattern-match confidences.
                    "confidence": min(
                        obligation.confidence if obligation else 0,
                        pattern.confidence if pattern else 0,
                    ),
                    "source": "auto",
                },
            )

            # 3. Update canonical_controls with pattern_id + obligation_ids
            # COALESCE keeps any value already present on the row.
            if result.control.pattern_id or result.control.obligation_ids:
                self.db.execute(
                    text("""
                        UPDATE canonical_controls
                        SET pattern_id = COALESCE(:pattern_id, pattern_id),
                            obligation_ids = COALESCE(:obligation_ids, obligation_ids)
                        WHERE id = CAST(:control_uuid AS uuid)
                    """),
                    {
                        "pattern_id": result.control.pattern_id,
                        "obligation_ids": json.dumps(result.control.obligation_ids),
                        "control_uuid": control_uuid,
                    },
                )

            # All three writes commit atomically.
            self.db.commit()
            result.crosswalk_written = True
            return True

        except Exception as e:
            logger.error("Failed to write crosswalk: %s", e)
            self.db.rollback()
            return False

    def stats(self) -> dict:
        """Return component statistics."""
        return {
            "extractor": self._extractor.stats(),
            "matcher": self._matcher.stats(),
            "initialized": self._initialized,
        }
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Migration Passes — Backfill existing 4,800+ controls
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class MigrationPasses:
    """Non-destructive migration passes for existing controls.

    Pass 1: Obligation Linkage (deterministic, article→obligation lookup)
    Pass 2: Pattern Classification (keyword-based matching)
    Pass 3: Quality Triage (categorize by linkage completeness)
    Pass 4: Crosswalk Backfill (write crosswalk rows for linked controls)
    Pass 5: Deduplication (mark duplicate controls)

    Usage::

        migration = MigrationPasses(db)
        await migration.initialize()

        result = await migration.run_pass1_obligation_linkage(limit=100)
        result = await migration.run_pass2_pattern_classification(limit=100)
        result = migration.run_pass3_quality_triage()
        result = migration.run_pass4_crosswalk_backfill()
        result = migration.run_pass5_deduplication()
    """

    def __init__(self, db: Session):
        self.db = db
        self._extractor = ObligationExtractor()
        self._matcher = PatternMatcher()
        self._initialized = False

    async def initialize(self) -> None:
        """Initialize extractors (loads obligations + patterns)."""
        if self._initialized:
            return
        # NOTE(review): reaches into the components' private loaders rather
        # than their async initialize() — presumably a synchronous, load-only
        # subset; confirm this stays in sync with those classes.
        self._extractor._load_obligations()
        self._matcher._load_patterns()
        self._matcher._build_keyword_index()
        self._initialized = True

    # -------------------------------------------------------------------
    # Pass 1: Obligation Linkage (deterministic)
    # -------------------------------------------------------------------

    async def run_pass1_obligation_linkage(self, limit: int = 0) -> dict:
        """Link existing controls to obligations via source_citation article.

        For each control with source_citation → extract regulation + article
        → look up in obligation framework → set obligation_ids.

        Args:
            limit: Max rows to process; 0 means no limit.

        Returns:
            Counters: total / linked / no_match / no_citation.
        """
        if not self._initialized:
            await self.initialize()

        query = """
            SELECT id, control_id, source_citation, generation_metadata
            FROM canonical_controls
            WHERE release_state NOT IN ('deprecated')
              AND (obligation_ids IS NULL OR obligation_ids = '[]')
        """
        # limit is an int parameter, so this f-string is not an injection vector.
        if limit > 0:
            query += f" LIMIT {limit}"

        rows = self.db.execute(text(query)).fetchall()

        stats = {"total": len(rows), "linked": 0, "no_match": 0, "no_citation": 0}

        for row in rows:
            control_uuid = str(row[0])
            control_id = row[1]
            citation = row[2]
            metadata = row[3]

            # Extract regulation + article from citation or metadata
            reg_code, article = _extract_regulation_article(citation, metadata)
            if not reg_code:
                stats["no_citation"] += 1
                continue

            # Tier 1: Exact match — deterministic lookup only; fuzzy tiers
            # are intentionally not used here.
            match = self._extractor._tier1_exact(reg_code, article or "")
            if match and match.obligation_id:
                self.db.execute(
                    text("""
                        UPDATE canonical_controls
                        SET obligation_ids = :obl_ids
                        WHERE id = CAST(:uuid AS uuid)
                    """),
                    {
                        "obl_ids": json.dumps([match.obligation_id]),
                        "uuid": control_uuid,
                    },
                )
                stats["linked"] += 1
            else:
                stats["no_match"] += 1

        # Single commit for the whole pass.
        self.db.commit()
        logger.info("Pass 1: %s", stats)
        return stats

    # -------------------------------------------------------------------
    # Pass 2: Pattern Classification (keyword-based)
    # -------------------------------------------------------------------

    async def run_pass2_pattern_classification(self, limit: int = 0) -> dict:
        """Classify existing controls into patterns via keyword matching.

        For each control without pattern_id → keyword-match title+objective
        against pattern library → assign best match.

        Args:
            limit: Max rows to process; 0 means no limit.

        Returns:
            Counters: total / classified / no_match.
        """
        if not self._initialized:
            await self.initialize()

        query = """
            SELECT id, control_id, title, objective
            FROM canonical_controls
            WHERE release_state NOT IN ('deprecated')
              AND (pattern_id IS NULL OR pattern_id = '')
        """
        # limit is an int parameter, so this f-string is not an injection vector.
        if limit > 0:
            query += f" LIMIT {limit}"

        rows = self.db.execute(text(query)).fetchall()

        stats = {"total": len(rows), "classified": 0, "no_match": 0}

        for row in rows:
            control_uuid = str(row[0])
            title = row[2] or ""
            objective = row[3] or ""

            # Keyword match on the concatenated title + objective text.
            match_text = f"{title} {objective}"
            result = self._matcher._tier1_keyword(match_text, None)

            # Require at least 2 keyword hits to avoid weak/noisy assignments.
            if result and result.pattern_id and result.keyword_hits >= 2:
                self.db.execute(
                    text("""
                        UPDATE canonical_controls
                        SET pattern_id = :pattern_id
                        WHERE id = CAST(:uuid AS uuid)
                    """),
                    {
                        "pattern_id": result.pattern_id,
                        "uuid": control_uuid,
                    },
                )
                stats["classified"] += 1
            else:
                stats["no_match"] += 1

        self.db.commit()
        logger.info("Pass 2: %s", stats)
        return stats

    # -------------------------------------------------------------------
    # Pass 3: Quality Triage
    # -------------------------------------------------------------------

    def run_pass3_quality_triage(self) -> dict:
        """Categorize controls by linkage completeness.

        Sets generation_metadata.triage_status:
        - "review": has both obligation_id + pattern_id
        - "needs_obligation": has pattern_id but no obligation_id
        - "needs_pattern": has obligation_id but no pattern_id
        - "legacy_unlinked": has neither

        Returns:
            Mapping of category name → number of rows updated.
        """
        # The four WHERE clauses partition the non-deprecated controls, so
        # each control receives exactly one triage_status.
        categories = {
            "review": """
                UPDATE canonical_controls
                SET generation_metadata = jsonb_set(
                    COALESCE(generation_metadata::jsonb, '{}'::jsonb),
                    '{triage_status}', '"review"'
                )
                WHERE release_state NOT IN ('deprecated')
                  AND obligation_ids IS NOT NULL AND obligation_ids != '[]'
                  AND pattern_id IS NOT NULL AND pattern_id != ''
            """,
            "needs_obligation": """
                UPDATE canonical_controls
                SET generation_metadata = jsonb_set(
                    COALESCE(generation_metadata::jsonb, '{}'::jsonb),
                    '{triage_status}', '"needs_obligation"'
                )
                WHERE release_state NOT IN ('deprecated')
                  AND (obligation_ids IS NULL OR obligation_ids = '[]')
                  AND pattern_id IS NOT NULL AND pattern_id != ''
            """,
            "needs_pattern": """
                UPDATE canonical_controls
                SET generation_metadata = jsonb_set(
                    COALESCE(generation_metadata::jsonb, '{}'::jsonb),
                    '{triage_status}', '"needs_pattern"'
                )
                WHERE release_state NOT IN ('deprecated')
                  AND obligation_ids IS NOT NULL AND obligation_ids != '[]'
                  AND (pattern_id IS NULL OR pattern_id = '')
            """,
            "legacy_unlinked": """
                UPDATE canonical_controls
                SET generation_metadata = jsonb_set(
                    COALESCE(generation_metadata::jsonb, '{}'::jsonb),
                    '{triage_status}', '"legacy_unlinked"'
                )
                WHERE release_state NOT IN ('deprecated')
                  AND (obligation_ids IS NULL OR obligation_ids = '[]')
                  AND (pattern_id IS NULL OR pattern_id = '')
            """,
        }

        stats = {}
        for category, sql in categories.items():
            result = self.db.execute(text(sql))
            stats[category] = result.rowcount

        self.db.commit()
        logger.info("Pass 3: %s", stats)
        return stats

    # -------------------------------------------------------------------
    # Pass 4: Crosswalk Backfill
    # -------------------------------------------------------------------

    def run_pass4_crosswalk_backfill(self) -> dict:
        """Create crosswalk_matrix rows for controls with obligation + pattern.

        Only creates rows that don't already exist (NOT EXISTS guard keeps
        the pass idempotent).
        """
        result = self.db.execute(text("""
            INSERT INTO crosswalk_matrix (
                regulation_code, obligation_id, pattern_id,
                master_control_id, master_control_uuid,
                confidence, source
            )
            SELECT
                COALESCE(
                    (generation_metadata::jsonb->>'source_regulation'),
                    ''
                ) AS regulation_code,
                obl.value::text AS obligation_id,
                cc.pattern_id,
                cc.control_id,
                cc.id,
                0.80,
                'migrated'
            FROM canonical_controls cc,
                 jsonb_array_elements_text(
                     COALESCE(cc.obligation_ids::jsonb, '[]'::jsonb)
                 ) AS obl(value)
            WHERE cc.release_state NOT IN ('deprecated')
              AND cc.pattern_id IS NOT NULL AND cc.pattern_id != ''
              AND cc.obligation_ids IS NOT NULL AND cc.obligation_ids != '[]'
              AND NOT EXISTS (
                  SELECT 1 FROM crosswalk_matrix cw
                  WHERE cw.master_control_uuid = cc.id
                    AND cw.obligation_id = obl.value::text
              )
        """))

        rows_inserted = result.rowcount
        self.db.commit()
        logger.info("Pass 4: %d crosswalk rows inserted", rows_inserted)
        return {"rows_inserted": rows_inserted}

    # -------------------------------------------------------------------
    # Pass 5: Deduplication
    # -------------------------------------------------------------------

    def run_pass5_deduplication(self) -> dict:
        """Mark duplicate controls (same obligation + same pattern).

        Groups controls by (obligation_id, pattern_id), keeps the one with
        highest evidence_confidence (or newest), marks rest as deprecated.
        """
        # Find groups with duplicates; array_agg is ordered so ids[0] is the
        # keeper (highest confidence, ties broken by newest created_at).
        groups = self.db.execute(text("""
            SELECT cc.pattern_id,
                   obl.value::text AS obligation_id,
                   array_agg(cc.id ORDER BY cc.evidence_confidence DESC NULLS LAST, cc.created_at DESC) AS ids,
                   count(*) AS cnt
            FROM canonical_controls cc,
                 jsonb_array_elements_text(
                     COALESCE(cc.obligation_ids::jsonb, '[]'::jsonb)
                 ) AS obl(value)
            WHERE cc.release_state NOT IN ('deprecated')
              AND cc.pattern_id IS NOT NULL AND cc.pattern_id != ''
            GROUP BY cc.pattern_id, obl.value::text
            HAVING count(*) > 1
        """)).fetchall()

        stats = {"groups_found": len(groups), "controls_deprecated": 0}

        for group in groups:
            ids = group[2]  # Array of UUIDs, first is the keeper
            if len(ids) <= 1:
                continue

            # Keep first (highest confidence), deprecate rest
            deprecate_ids = ids[1:]
            for dep_id in deprecate_ids:
                self.db.execute(
                    text("""
                        UPDATE canonical_controls
                        SET release_state = 'deprecated',
                            generation_metadata = jsonb_set(
                                COALESCE(generation_metadata::jsonb, '{}'::jsonb),
                                '{deprecated_reason}', '"duplicate_same_obligation_pattern"'
                            )
                        WHERE id = CAST(:uuid AS uuid)
                          AND release_state != 'deprecated'
                    """),
                    {"uuid": str(dep_id)},
                )
                stats["controls_deprecated"] += 1

        self.db.commit()
        logger.info("Pass 5: %s", stats)
        return stats

    def migration_status(self) -> dict:
        """Return overall migration progress (counts + coverage percentages)."""
        row = self.db.execute(text("""
            SELECT
                count(*) AS total,
                count(*) FILTER (WHERE obligation_ids IS NOT NULL AND obligation_ids != '[]') AS has_obligation,
                count(*) FILTER (WHERE pattern_id IS NOT NULL AND pattern_id != '') AS has_pattern,
                count(*) FILTER (
                    WHERE obligation_ids IS NOT NULL AND obligation_ids != '[]'
                      AND pattern_id IS NOT NULL AND pattern_id != ''
                ) AS fully_linked,
                count(*) FILTER (WHERE release_state = 'deprecated') AS deprecated
            FROM canonical_controls
        """)).fetchone()

        return {
            "total_controls": row[0],
            "has_obligation": row[1],
            "has_pattern": row[2],
            "fully_linked": row[3],
            "deprecated": row[4],
            # max(..., 1) guards against division by zero on an empty table.
            "coverage_obligation_pct": round(row[1] / max(row[0], 1) * 100, 1),
            "coverage_pattern_pct": round(row[2] / max(row[0], 1) * 100, 1),
            "coverage_full_pct": round(row[3] / max(row[0], 1) * 100, 1),
        }
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _extract_regulation_article(
    citation: Optional[str], metadata: Optional[str]
) -> tuple[Optional[str], Optional[str]]:
    """Extract (regulation_code, article) from a control's citation/metadata.

    The citation is consulted first; generation_metadata serves as a
    fallback for whichever of the two fields is still missing. Either
    input may be a JSON string or an already-parsed dict; unparsable
    input is silently ignored.

    Returns:
        ``(regulation_code, article)`` — each ``None`` when not found.
    """
    from compliance.services.obligation_extractor import _normalize_regulation

    reg_code = None
    article = None

    # Try citation first (JSON string or dict)
    if citation:
        try:
            c = json.loads(citation) if isinstance(citation, str) else citation
            if isinstance(c, dict):
                article = c.get("article") or c.get("source_article")
                # Regulation comes from the citation's source field.
                source = c.get("source", "")
                if source:
                    reg_code = _normalize_regulation(source)
        except (json.JSONDecodeError, TypeError):
            pass

    # Fallback to metadata for whichever field is still missing.
    # BUG FIX: this branch previously ran only when reg_code was missing,
    # so a citation that yielded a regulation but no article never had its
    # article backfilled from metadata (the inner "if not article" check
    # was unreachable in that case).
    if metadata and (not reg_code or not article):
        try:
            m = json.loads(metadata) if isinstance(metadata, str) else metadata
            if isinstance(m, dict):
                if not reg_code:
                    src_reg = m.get("source_regulation", "")
                    if src_reg:
                        reg_code = _normalize_regulation(src_reg)
                if not article:
                    article = m.get("source_article")
        except (json.JSONDecodeError, TypeError):
            pass

    return reg_code, article
|
||||
@@ -33,6 +33,7 @@ class RAGSearchResult:
|
||||
paragraph: str
|
||||
source_url: str
|
||||
score: float
|
||||
collection: str = ""
|
||||
|
||||
|
||||
class ComplianceRAGClient:
|
||||
@@ -91,6 +92,7 @@ class ComplianceRAGClient:
|
||||
paragraph=r.get("paragraph", ""),
|
||||
source_url=r.get("source_url", ""),
|
||||
score=r.get("score", 0.0),
|
||||
collection=collection,
|
||||
))
|
||||
return results
|
||||
|
||||
@@ -98,6 +100,88 @@ class ComplianceRAGClient:
|
||||
logger.warning("RAG search failed: %s", e)
|
||||
return []
|
||||
|
||||
async def search_with_rerank(
    self,
    query: str,
    collection: str = "bp_compliance_ce",
    regulations: Optional[List[str]] = None,
    top_k: int = 5,
) -> List[RAGSearchResult]:
    """Search with optional cross-encoder re-ranking.

    Pulls an oversized candidate set (top_k*4, at least 20) from the plain
    search, re-ranks it with the cross-encoder, and returns the best top_k.
    Falls back to the plain search when the reranker is disabled, and to
    the unranked head of the candidates when re-ranking raises.
    """
    from .reranker import get_reranker

    reranker = get_reranker()
    if reranker is None:
        return await self.search(query, collection, regulations, top_k)

    # Over-fetch so the cross-encoder has a meaningful pool to rank.
    fetch_n = max(top_k * 4, 20)
    candidates = await self.search(query, collection, regulations, top_k=fetch_n)
    if not candidates:
        return []

    try:
        order = reranker.rerank(query, [c.text for c in candidates], top_k=top_k)
    except Exception as e:
        logger.warning("Reranking failed, returning unranked: %s", e)
        return candidates[:top_k]
    return [candidates[i] for i in order]
|
||||
|
||||
async def scroll(
    self,
    collection: str,
    offset: Optional[str] = None,
    limit: int = 100,
) -> tuple[List[RAGSearchResult], Optional[str]]:
    """Page through ALL chunks of *collection*.

    Returns ``(chunks, next_offset)``; ``next_offset`` is ``None`` once the
    collection is exhausted, and ``([], None)`` on any HTTP or network error.
    """
    # The scroll endpoint lives next to the search endpoint.
    scroll_url = self._search_url.replace("/search", "/scroll")
    params = {"collection": collection, "limit": str(limit)}
    if offset:
        params["offset"] = offset

    try:
        async with httpx.AsyncClient(timeout=30.0) as client:
            resp = await client.get(scroll_url, params=params)

            if resp.status_code != 200:
                logger.warning(
                    "RAG scroll returned %d: %s", resp.status_code, resp.text[:200]
                )
                return [], None

            data = resp.json()
            # Scroll results carry no similarity score — fixed at 0.0.
            chunks = [
                RAGSearchResult(
                    text=item.get("text", ""),
                    regulation_code=item.get("regulation_code", ""),
                    regulation_name=item.get("regulation_name", ""),
                    regulation_short=item.get("regulation_short", ""),
                    category=item.get("category", ""),
                    article=item.get("article", ""),
                    paragraph=item.get("paragraph", ""),
                    source_url=item.get("source_url", ""),
                    score=0.0,
                    collection=collection,
                )
                for item in data.get("chunks", [])
            ]
            return chunks, (data.get("next_offset") or None)

    except Exception as e:
        logger.warning("RAG scroll failed: %s", e)
        return [], None
|
||||
|
||||
def format_for_prompt(
|
||||
self, results: List[RAGSearchResult], max_results: int = 5
|
||||
) -> str:
|
||||
|
||||
85
backend-compliance/compliance/services/reranker.py
Normal file
85
backend-compliance/compliance/services/reranker.py
Normal file
@@ -0,0 +1,85 @@
|
||||
"""
|
||||
Cross-Encoder Re-Ranking for RAG Search Results.
|
||||
|
||||
Uses BGE Reranker v2 (BAAI/bge-reranker-v2-m3, MIT license) to re-rank
|
||||
search results from Qdrant for improved retrieval quality.
|
||||
|
||||
Lazy-loads the model on first use. Disabled by default (RERANK_ENABLED=false).
|
||||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
from typing import Optional
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
RERANK_ENABLED = os.getenv("RERANK_ENABLED", "false").lower() == "true"
|
||||
RERANK_MODEL = os.getenv("RERANK_MODEL", "BAAI/bge-reranker-v2-m3")
|
||||
|
||||
|
||||
class Reranker:
    """Cross-encoder reranker backed by sentence-transformers.

    The model is loaded lazily on the first ``rerank`` call.
    """

    def __init__(self, model_name: str = RERANK_MODEL):
        self._model_name = model_name
        self._model = None  # lazily created CrossEncoder

    def _ensure_model(self) -> None:
        """Load the cross-encoder once; re-raises on any load failure."""
        if self._model is not None:
            return
        try:
            from sentence_transformers import CrossEncoder

            logger.info("Loading reranker model: %s", self._model_name)
            self._model = CrossEncoder(self._model_name)
            logger.info("Reranker model loaded successfully")
        except ImportError:
            logger.error(
                "sentence-transformers not installed. "
                "Install with: pip install sentence-transformers"
            )
            raise
        except Exception as e:
            logger.error("Failed to load reranker model: %s", e)
            raise

    def rerank(
        self, query: str, texts: list[str], top_k: int = 5
    ) -> list[int]:
        """
        Return indices of top_k texts sorted by relevance (highest first).

        Args:
            query: The search query.
            texts: List of candidate texts to re-rank.
            top_k: Number of top results to return.

        Returns:
            List of indices into the original texts list, sorted by relevance.
        """
        if not texts:
            return []

        self._ensure_model()

        scores = self._model.predict([[query, candidate] for candidate in texts])

        # Highest score first; indices refer to the caller's texts list.
        order = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)
        return order[:top_k]
|
||||
|
||||
|
||||
# Module-level singleton
|
||||
# Module-level singleton, created on first enabled access.
_reranker: Optional[Reranker] = None


def get_reranker() -> Optional[Reranker]:
    """Return the process-wide reranker, or ``None`` when re-ranking is disabled."""
    global _reranker
    if not RERANK_ENABLED:
        return None
    if _reranker is None:
        _reranker = Reranker()
    return _reranker
|
||||
331
backend-compliance/compliance/services/v1_enrichment.py
Normal file
331
backend-compliance/compliance/services/v1_enrichment.py
Normal file
@@ -0,0 +1,331 @@
|
||||
"""V1 Control Enrichment Service — Match Eigenentwicklung controls to regulations.
|
||||
|
||||
Finds regulatory coverage for v1 controls (generation_strategy='ungrouped',
|
||||
pipeline_version=1, no source_citation) by embedding similarity search.
|
||||
|
||||
Reuses embedding + Qdrant helpers from control_dedup.py.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import Optional
|
||||
|
||||
from sqlalchemy import text
|
||||
|
||||
from database import SessionLocal
|
||||
from compliance.services.control_dedup import (
|
||||
get_embedding,
|
||||
qdrant_search_cross_regulation,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Similarity threshold — lower than dedup (0.85) since we want informational matches
|
||||
# Typical top scores for v1 controls are 0.70-0.77
|
||||
V1_MATCH_THRESHOLD = 0.70
|
||||
V1_MAX_MATCHES = 5
|
||||
|
||||
|
||||
def _is_eigenentwicklung_query() -> str:
|
||||
"""SQL WHERE clause identifying v1 Eigenentwicklung controls."""
|
||||
return """
|
||||
generation_strategy = 'ungrouped'
|
||||
AND (pipeline_version = '1' OR pipeline_version IS NULL)
|
||||
AND source_citation IS NULL
|
||||
AND parent_control_uuid IS NULL
|
||||
AND release_state NOT IN ('rejected', 'merged', 'deprecated')
|
||||
"""
|
||||
|
||||
|
||||
async def count_v1_controls() -> int:
    """Return how many v1 Eigenentwicklung controls currently exist."""
    sql = f"""
        SELECT COUNT(*) AS cnt
        FROM canonical_controls
        WHERE {_is_eigenentwicklung_query()}
    """
    with SessionLocal() as db:
        row = db.execute(text(sql)).fetchone()
    return row.cnt if row else 0
|
||||
|
||||
|
||||
async def enrich_v1_matches(
    dry_run: bool = True,
    batch_size: int = 100,
    offset: int = 0,
) -> dict:
    """Find regulatory matches for v1 Eigenentwicklung controls.

    Args:
        dry_run: If True, only count — don't write matches.
        batch_size: Number of v1 controls to process per call.
        offset: Pagination offset (v1 control index).

    Returns:
        Stats dict with counts, sample matches, and pagination info.
    """
    with SessionLocal() as db:
        # 1. Load v1 controls (paginated)
        v1_controls = db.execute(text(f"""
            SELECT id, control_id, title, objective, category
            FROM canonical_controls
            WHERE {_is_eigenentwicklung_query()}
            ORDER BY control_id
            LIMIT :limit OFFSET :offset
        """), {"limit": batch_size, "offset": offset}).fetchall()

        # Count total for pagination
        total_row = db.execute(text(f"""
            SELECT COUNT(*) AS cnt
            FROM canonical_controls
            WHERE {_is_eigenentwicklung_query()}
        """)).fetchone()
        total_v1 = total_row.cnt if total_row else 0

        # Past the end of the v1 set — nothing left to process.
        if not v1_controls:
            return {
                "dry_run": dry_run,
                "processed": 0,
                "total_v1": total_v1,
                "message": "Kein weiterer Batch — alle v1 Controls verarbeitet.",
            }

        # Dry run: report the batch contents without embedding or writing.
        if dry_run:
            return {
                "dry_run": True,
                "total_v1": total_v1,
                "offset": offset,
                "batch_size": batch_size,
                "sample_controls": [
                    {
                        "control_id": r.control_id,
                        "title": r.title,
                        "category": r.category,
                    }
                    for r in v1_controls[:20]
                ],
            }

        # 2. Process each v1 control
        processed = 0
        matches_inserted = 0
        errors = []
        sample_matches = []

        for v1 in v1_controls:
            try:
                # Build search text
                search_text = f"{v1.title} — {v1.objective}"

                # Get embedding
                embedding = await get_embedding(search_text)
                if not embedding:
                    errors.append({
                        "control_id": v1.control_id,
                        "error": "Embedding fehlgeschlagen",
                    })
                    continue

                # Search Qdrant (cross-regulation, no pattern filter).
                # Collection is atomic_controls_dedup (contains ~51k atomic controls).
                results = await qdrant_search_cross_regulation(
                    embedding, top_k=20,
                    collection="atomic_controls_dedup",
                )

                # For each hit: resolve to a regulatory parent with source_citation.
                # Atomic controls in Qdrant usually have parent_control_uuid → parent
                # has the source_citation. We deduplicate by parent to avoid
                # listing the same regulation multiple times.
                rank = 0
                seen_parents: set[str] = set()

                for hit in results:
                    score = hit.get("score", 0)
                    # Informational threshold (0.70) — intentionally lower
                    # than the 0.85 used for dedup.
                    if score < V1_MATCH_THRESHOLD:
                        continue

                    payload = hit.get("payload", {})
                    matched_uuid = payload.get("control_uuid")
                    # Skip unresolvable hits and self-matches.
                    if not matched_uuid or matched_uuid == str(v1.id):
                        continue

                    # Try the matched control itself first, then its parent
                    matched_row = db.execute(text("""
                        SELECT c.id, c.control_id, c.title, c.source_citation,
                               c.severity, c.category, c.parent_control_uuid
                        FROM canonical_controls c
                        WHERE c.id = CAST(:uuid AS uuid)
                    """), {"uuid": matched_uuid}).fetchone()

                    if not matched_row:
                        continue

                    # Resolve to regulatory control (one with source_citation)
                    reg_row = matched_row
                    if not reg_row.source_citation and reg_row.parent_control_uuid:
                        # Look up parent — the parent has the source_citation
                        parent_row = db.execute(text("""
                            SELECT id, control_id, title, source_citation,
                                   severity, category, parent_control_uuid
                            FROM canonical_controls
                            WHERE id = CAST(:uuid AS uuid)
                              AND source_citation IS NOT NULL
                        """), {"uuid": str(reg_row.parent_control_uuid)}).fetchone()
                        if parent_row:
                            reg_row = parent_row

                    # No citation anywhere in the chain → not a regulatory match.
                    if not reg_row.source_citation:
                        continue

                    # Deduplicate by parent UUID
                    parent_key = str(reg_row.id)
                    if parent_key in seen_parents:
                        continue
                    seen_parents.add(parent_key)

                    # Cap at V1_MAX_MATCHES distinct regulatory parents per control.
                    rank += 1
                    if rank > V1_MAX_MATCHES:
                        break

                    # Extract source info
                    source_citation = reg_row.source_citation or {}
                    matched_source = source_citation.get("source") if isinstance(source_citation, dict) else None
                    matched_article = source_citation.get("article") if isinstance(source_citation, dict) else None

                    # Insert match — link to the regulatory parent (not the atomic child).
                    # Upsert keeps the pass idempotent across re-runs.
                    db.execute(text("""
                        INSERT INTO v1_control_matches
                            (v1_control_uuid, matched_control_uuid, similarity_score,
                             match_rank, matched_source, matched_article, match_method)
                        VALUES
                            (CAST(:v1_uuid AS uuid), CAST(:matched_uuid AS uuid), :score,
                             :rank, :source, :article, 'embedding')
                        ON CONFLICT (v1_control_uuid, matched_control_uuid) DO UPDATE
                        SET similarity_score = EXCLUDED.similarity_score,
                            match_rank = EXCLUDED.match_rank
                    """), {
                        "v1_uuid": str(v1.id),
                        "matched_uuid": str(reg_row.id),
                        "score": round(score, 3),
                        "rank": rank,
                        "source": matched_source,
                        "article": matched_article,
                    })
                    matches_inserted += 1

                    # Collect sample
                    if len(sample_matches) < 20:
                        sample_matches.append({
                            "v1_control_id": v1.control_id,
                            "v1_title": v1.title,
                            "matched_control_id": reg_row.control_id,
                            "matched_title": reg_row.title,
                            "matched_source": matched_source,
                            "matched_article": matched_article,
                            "similarity_score": round(score, 3),
                            "match_rank": rank,
                        })

                processed += 1

            except Exception as e:
                # Per-control failures are collected, not raised, so the
                # batch keeps going.
                logger.warning("V1 enrichment error for %s: %s", v1.control_id, e)
                errors.append({
                    "control_id": v1.control_id,
                    "error": str(e),
                })

        # Single commit per batch.
        db.commit()

        # Pagination: a full batch implies there may be more to fetch.
        next_offset = offset + batch_size if len(v1_controls) == batch_size else None

        return {
            "dry_run": False,
            "offset": offset,
            "batch_size": batch_size,
            "next_offset": next_offset,
            "total_v1": total_v1,
            "processed": processed,
            "matches_inserted": matches_inserted,
            "errors": errors[:10],
            "sample_matches": sample_matches,
        }
|
||||
|
||||
|
||||
async def get_v1_matches(control_uuid: str) -> list[dict]:
    """Get all regulatory matches for a specific v1 control.

    Args:
        control_uuid: The UUID of the v1 control.

    Returns:
        List of match dicts with control details, ordered by match rank.
    """
    with SessionLocal() as db:
        records = db.execute(text("""
            SELECT
                m.similarity_score,
                m.match_rank,
                m.matched_source,
                m.matched_article,
                m.match_method,
                c.control_id AS matched_control_id,
                c.title AS matched_title,
                c.objective AS matched_objective,
                c.severity AS matched_severity,
                c.category AS matched_category,
                c.source_citation AS matched_source_citation
            FROM v1_control_matches m
            JOIN canonical_controls c ON c.id = m.matched_control_uuid
            WHERE m.v1_control_uuid = CAST(:uuid AS uuid)
            ORDER BY m.match_rank
        """), {"uuid": control_uuid}).fetchall()

        # Shape each row into the API response dict; similarity_score is a
        # SQL NUMERIC, so coerce it to a plain float for JSON serialization.
        matches: list[dict] = []
        for record in records:
            matches.append({
                "matched_control_id": record.matched_control_id,
                "matched_title": record.matched_title,
                "matched_objective": record.matched_objective,
                "matched_severity": record.matched_severity,
                "matched_category": record.matched_category,
                "matched_source": record.matched_source,
                "matched_article": record.matched_article,
                "matched_source_citation": record.matched_source_citation,
                "similarity_score": float(record.similarity_score),
                "match_rank": record.match_rank,
                "match_method": record.match_method,
            })
        return matches
|
||||
|
||||
|
||||
async def get_v1_enrichment_stats() -> dict:
    """Get overview stats for v1 enrichment.

    Returns:
        Dict with total v1 control count, how many have at least one
        regulatory match, how many have none, the total match count, and
        the average similarity score across all matches (None when there
        are no matches yet).
    """
    with SessionLocal() as db:
        # NOTE: _is_eigenentwicklung_query() returns a SQL predicate fragment
        # interpolated via f-string. It is an internal helper (no user input),
        # but any change to its column names must be mirrored below.
        total_v1 = db.execute(text(f"""
            SELECT COUNT(*) AS cnt FROM canonical_controls
            WHERE {_is_eigenentwicklung_query()}
        """)).fetchone()

        # Qualify every column referenced by the predicate with the alias
        # "c." so it can be used in the joined query. This is fragile
        # (plain substring replacement) — keep the column list in sync with
        # _is_eigenentwicklung_query().
        qualified_predicate = _is_eigenentwicklung_query()
        for col in (
            "release_state",
            "generation_strategy",
            "pipeline_version",
            "source_citation",
            "parent_control_uuid",
        ):
            qualified_predicate = qualified_predicate.replace(col, f"c.{col}")

        matched_v1 = db.execute(text(f"""
            SELECT COUNT(DISTINCT m.v1_control_uuid) AS cnt
            FROM v1_control_matches m
            JOIN canonical_controls c ON c.id = m.v1_control_uuid
            WHERE {qualified_predicate}
        """)).fetchone()

        total_matches = db.execute(text("""
            SELECT COUNT(*) AS cnt FROM v1_control_matches
        """)).fetchone()

        avg_score = db.execute(text("""
            SELECT AVG(similarity_score) AS avg_score FROM v1_control_matches
        """)).fetchone()

        total_cnt = total_v1.cnt if total_v1 else 0
        matched_cnt = matched_v1.cnt if matched_v1 else 0
        # Use an explicit None check: a legitimate average of 0.0 must not be
        # reported as "no data" (the previous truthiness test dropped it).
        avg = (
            round(float(avg_score.avg_score), 3)
            if avg_score is not None and avg_score.avg_score is not None
            else None
        )

        return {
            "total_v1_controls": total_cnt,
            "v1_with_matches": matched_cnt,
            "v1_without_matches": total_cnt - matched_cnt,
            "total_matches": total_matches.cnt if total_matches else 0,
            "avg_similarity_score": avg,
        }
|
||||
Reference in New Issue
Block a user