backend-lehrer (11 files): - llm_gateway/routes/schools.py (867 → 5), recording_api.py (848 → 6) - messenger_api.py (840 → 5), print_generator.py (824 → 5) - unit_analytics_api.py (751 → 5), classroom/routes/context.py (726 → 4) - llm_gateway/routes/edu_search_seeds.py (710 → 4) klausur-service (12 files): - ocr_labeling_api.py (845 → 4), metrics_db.py (833 → 4) - legal_corpus_api.py (790 → 4), page_crop.py (758 → 3) - mail/ai_service.py (747 → 4), github_crawler.py (767 → 3) - trocr_service.py (730 → 4), full_compliance_pipeline.py (723 → 4) - dsfa_rag_api.py (715 → 4), ocr_pipeline_auto.py (705 → 4) website (6 pages): - audit-checklist (867 → 8), content (806 → 6) - screen-flow (790 → 4), scraper (789 → 5) - zeugnisse (776 → 5), modules (745 → 4) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
201 lines
7.2 KiB
Python
201 lines
7.2 KiB
Python
"""
|
|
Compliance Extraction & Generation.
|
|
|
|
Functions for extracting checkpoints from legal text chunks,
|
|
generating controls, and creating remediation measures.
|
|
"""
|
|
|
|
import re
|
|
import hashlib
|
|
import logging
|
|
from typing import Dict, List, Optional
|
|
|
|
from compliance_models import Checkpoint, Control, Measure
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
def extract_checkpoints_from_chunk(chunk_text: str, payload: Dict) -> List[Checkpoint]:
    """
    Extract checkpoints/requirements from a chunk of legal text.

    Scans *chunk_text* with a fixed ordered set of regex patterns
    (BSI-TR requirement IDs, article headings, numbered clauses, and
    German obligation/requirement phrases) and builds one ``Checkpoint``
    per sufficiently long match.

    Args:
        chunk_text: Raw text of the legal-document chunk.
        payload: Chunk metadata; ``regulation_code``, ``regulation_name``
            and ``source_url`` are read, each with a fallback.

    Returns:
        List of extracted ``Checkpoint`` objects (possibly empty).
    """
    found: List[Checkpoint] = []
    reg_code = payload.get("regulation_code", "UNKNOWN")
    reg_name = payload.get("regulation_name", "Unknown")
    source_url = payload.get("source_url", "")
    # Short stable id derived from the chunk's prefix (not security-relevant).
    chunk_id = hashlib.md5(chunk_text[:100].encode()).hexdigest()[:8]

    # (regex, kind) pairs, tried in this order. Kinds without a dedicated
    # branch below ('obligation', 'requirement') use the generic extraction.
    patterns = [
        # BSI-TR requirement identifiers, e.g. "O.Foo_1: ..."
        (r'([OT]\.[A-Za-z_]+\d*)[:\s]+(.+?)(?=\n[OT]\.|$)', 'bsi_requirement'),
        # Article headings (GDPR, AI Act, etc.)
        (r'(?:Artikel|Art\.?)\s+(\d+)(?:\s+Abs(?:atz)?\.?\s*(\d+))?\s*[-\u2013:]\s*(.+?)(?=\n|$)', 'article'),
        # Numbered clauses "(1) ..."
        (r'\((\d+)\)\s+(.+?)(?=\n\(\d+\)|$)', 'numbered'),
        # "Der Verantwortliche muss ..." obligation phrases
        (r'(?:Der Verantwortliche|Die Aufsichtsbeh\u00f6rde|Der Auftragsverarbeiter)\s+(muss|hat|soll)\s+(.+?)(?=\.\s|$)', 'obligation'),
        # "Es ist erforderlich ..." requirement phrases
        (r'(?:Es ist erforderlich|Es muss gew\u00e4hrleistet|Es sind geeignete)\s+(.+?)(?=\.\s|$)', 'requirement'),
    ]

    for regex, kind in patterns:
        for m in re.finditer(regex, chunk_text, re.MULTILINE | re.DOTALL):
            if kind == 'bsi_requirement':
                req_id = m.group(1)
                description = m.group(2).strip()
                title = req_id
            elif kind == 'article':
                art_no = m.group(1)
                para = m.group(2) or ""
                description = m.group(3).strip()
                req_id = f"{reg_code}-Art{art_no}"
                if para:
                    req_id += f"-{para}"
                title = f"Art. {art_no}" + (f" Abs. {para}" if para else "")
            elif kind == 'numbered':
                num = m.group(1)
                description = m.group(2).strip()
                req_id = f"{reg_code}-{num}"
                title = f"Anforderung {num}"
            else:
                # Generic: keep the whole matched phrase as the requirement.
                description = m.group(0).strip()
                req_id = f"{reg_code}-{chunk_id}-{len(found)}"
                title = description[:50] + "..." if len(description) > 50 else description

            # Too short to be a meaningful requirement statement.
            if len(description) < 20:
                continue

            found.append(Checkpoint(
                id=req_id,
                regulation_code=reg_code,
                regulation_name=reg_name,
                # Only article-style titles carry an article reference.
                article=title if 'Art' in title else None,
                title=title,
                description=description[:500],
                original_text=description,
                chunk_id=chunk_id,
                source_url=source_url,
            ))

    return found
|
|
|
|
|
|
def generate_control_for_checkpoints(
    checkpoints: List[Checkpoint],
    domain_counts: Dict[str, int],
) -> Optional[Control]:
    """
    Generate a control that covers the given checkpoints.

    Simplified heuristic version (production would use the AI assistant):
    the security domain is chosen by the first keyword group that matches
    the combined checkpoint descriptions, and the control id is numbered
    sequentially per domain.

    Args:
        checkpoints: Checkpoints the control should cover; the first
            checkpoint's regulation code and title seed the control.
        domain_counts: Count of controls already generated per domain,
            used to number the new control id. Not mutated here.

    Returns:
        The generated ``Control``, or ``None`` if *checkpoints* is empty.
    """
    if not checkpoints:
        return None

    regulation = checkpoints[0].regulation_code
    all_text = " ".join(cp.description for cp in checkpoints).lower()

    # Ordered keyword table: the first group with any hit decides the
    # domain; nothing matching falls back to governance.
    keyword_domains = [
        ("crypto", ("verschl\u00fcssel", "krypto", "encrypt", "hash")),
        ("iam", ("zugang", "access", "authentif", "login", "benutzer")),
        ("priv", ("datenschutz", "personenbezogen", "privacy", "einwilligung")),
        ("sdlc", ("entwicklung", "test", "code", "software")),
        ("aud", ("\u00fcberwach", "monitor", "log", "audit")),
        ("ai", ("ki", "k\u00fcnstlich", "ai", "machine learning", "model")),
        ("ops", ("betrieb", "operation", "verf\u00fcgbar", "backup")),
        ("cra", ("cyber", "resilience", "sbom", "vulnerab")),
    ]
    domain = next(
        (dom for dom, kws in keyword_domains if any(kw in all_text for kw in kws)),
        "gov",
    )

    # Per-domain sequential id, e.g. "CRYPTO-001".
    control_id = f"{domain.upper()}-{domain_counts.get(domain, 0) + 1:03d}"

    # Title comes from the first checkpoint, truncated to 100 chars.
    title = checkpoints[0].title
    if len(title) > 100:
        title = title[:97] + "..."

    description = f"Control f\u00fcr {regulation}: " + checkpoints[0].description[:200]
    pass_criteria = f"Alle {len(checkpoints)} zugeh\u00f6rigen Anforderungen sind erf\u00fcllt und dokumentiert."
    guidance = (
        f"Implementiere Ma\u00dfnahmen zur Erf\u00fcllung der Anforderungen aus {regulation}. "
        f"Dokumentiere die Umsetzung und f\u00fchre regelm\u00e4\u00dfige Reviews durch."
    )

    # Heuristic: tooling/automation keywords mark the control as automatable.
    automated = any(kw in all_text for kw in ("automat", "tool", "scan", "test"))

    return Control(
        id=control_id,
        domain=domain,
        title=title,
        description=description,
        checkpoints=[cp.id for cp in checkpoints],
        pass_criteria=pass_criteria,
        implementation_guidance=guidance,
        is_automated=automated,
        automation_tool="CI/CD Pipeline" if automated else None,
        priority="high" if "muss" in all_text or "erforderlich" in all_text else "medium",
    )
|
|
|
|
|
|
def generate_measure_for_control(control: Control) -> Measure:
    """
    Generate a remediation measure for *control*.

    The deadline is derived from the control's priority and the
    responsible party from its domain; unknown values fall back to
    90 days and the Compliance Team respectively.

    Args:
        control: The control the measure implements.

    Returns:
        A ``Measure`` in ``"pending"`` status, id-prefixed with ``M-``.
    """
    # Priority -> days until the measure is due (unknown priority: 90).
    deadline_by_priority = {
        "critical": 30,
        "high": 60,
        "medium": 90,
        "low": 180,
    }

    # Domain -> owning team (unknown domain: Compliance Team).
    team_by_domain = {
        "priv": "Datenschutzbeauftragter",
        "iam": "IT-Security Team",
        "sdlc": "Entwicklungsteam",
        "crypto": "IT-Security Team",
        "ops": "Operations Team",
        "aud": "Compliance Team",
        "ai": "AI/ML Team",
        "cra": "IT-Security Team",
        "gov": "Management",
    }

    return Measure(
        id=f"M-{control.id}",
        control_id=control.id,
        title=f"Umsetzung: {control.title[:50]}",
        description=f"Implementierung und Dokumentation von {control.id}: {control.description[:100]}",
        responsible=team_by_domain.get(control.domain, "Compliance Team"),
        deadline_days=deadline_by_priority.get(control.priority, 90),
        status="pending",
    )
|