"""Decomposition Pass — Split Rich Controls into Atomic Controls. Pass 0 of the Multi-Layer Control Architecture migration. Runs BEFORE Passes 1-5 (obligation linkage, pattern classification, etc.). Two sub-passes: Pass 0a: Obligation Extraction — extract individual normative obligations from a Rich Control using LLM with strict guardrails. Pass 0b: Atomic Control Composition — turn each obligation candidate into a standalone atomic control record. Plus a Quality Gate that validates extraction results. Guardrails (the 6 rules): 1. Only normative statements (müssen, sicherzustellen, verpflichtet, ...) 2. One main verb per obligation 3. Test obligations separate from operational obligations 4. Reporting obligations separate 5. Don't split at evidence level 6. Parent link always preserved """ import json import logging import os import re import uuid from dataclasses import dataclass, field from typing import Optional import httpx from sqlalchemy import text from sqlalchemy.orm import Session logger = logging.getLogger(__name__) # --------------------------------------------------------------------------- # LLM Provider Config # --------------------------------------------------------------------------- ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY", "") ANTHROPIC_MODEL = os.getenv("DECOMPOSITION_LLM_MODEL", "claude-haiku-4-5-20251001") DECOMPOSITION_BATCH_SIZE = int(os.getenv("DECOMPOSITION_BATCH_SIZE", "5")) LLM_TIMEOUT = float(os.getenv("DECOMPOSITION_LLM_TIMEOUT", "120")) ANTHROPIC_API_URL = "https://api.anthropic.com/v1" # --------------------------------------------------------------------------- # Normative signal detection — 3-Tier Classification # --------------------------------------------------------------------------- # Tier 1: Pflicht (mandatory) — strong normative signals # Tier 2: Empfehlung (recommendation) — weaker normative signals # Tier 3: Kann (optional/permissive) — permissive signals # Nothing is rejected — everything is classified. 
#
# Patterns are defined in normative_patterns.py and imported here
# with local aliases for backward compatibility.
from .normative_patterns import (
    PFLICHT_RE as _PFLICHT_RE,
    EMPFEHLUNG_RE as _EMPFEHLUNG_RE,
    KANN_RE as _KANN_RE,
    NORMATIVE_RE as _NORMATIVE_RE,
    RATIONALE_RE as _RATIONALE_RE,
)

# Signals marking an obligation as a *test* obligation (guardrail rule 3).
_TEST_SIGNALS = [
    r"\btesten\b",
    r"\btest\b",
    r"\bprüfung\b",
    r"\bprüfen\b",
    r"\bgetestet\b",
    r"\bwirksamkeit\b",
    r"\baudit\b",
    r"\bregelmäßig\b.*\b(prüf|test|kontroll)",
    r"\beffectiveness\b",
    r"\bverif",
]
_TEST_RE = re.compile("|".join(_TEST_SIGNALS), re.IGNORECASE)

# Signals marking an obligation as a *reporting* obligation (guardrail rule 4).
_REPORTING_SIGNALS = [
    r"\bmelden\b",
    r"\bmeldung\b",
    r"\bunterricht",
    r"\binformieren\b",
    r"\bbenachricht",
    r"\bnotif",
    r"\breport\b",
    r"\bbehörd",
]
_REPORTING_RE = re.compile("|".join(_REPORTING_SIGNALS), re.IGNORECASE)

# ---------------------------------------------------------------------------
# Merge & Enrichment helpers
# ---------------------------------------------------------------------------

# Trigger-type detection patterns: event-driven obligations (incident, breach,
# "upon ...", "unverzüglich", ...) vs. periodic ones (annually, quarterly, ...).
_EVENT_TRIGGERS = re.compile(
    r"\b(vorfall|incident|breach|verletzung|sicherheitsvorfall|meldung|entdeckung"
    r"|feststellung|erkennung|ereignis|eintritt|bei\s+auftreten|im\s+falle"
    r"|wenn\s+ein|sobald|unverzüglich|upon|in\s+case\s+of|when\s+a)\b",
    re.IGNORECASE,
)
_PERIODIC_TRIGGERS = re.compile(
    r"\b(jährlich|monatlich|quartalsweise|regelmäßig|periodisch|annually"
    r"|monthly|quarterly|periodic|mindestens\s+(einmal|alle)|turnusmäßig"
    r"|wiederkehrend|in\s+regelmäßigen\s+abständen)\b",
    re.IGNORECASE,
)

# Implementation-specific keywords (concrete tools/protocols/formats).
# A hit means the obligation text names a concrete technology rather than an
# abstract requirement.
_IMPL_SPECIFIC_PATTERNS = re.compile(
    r"\b(TLS|SSL|AES|RSA|SHA-\d|HTTPS|LDAP|SAML|OAuth|OIDC|MFA|2FA"
    r"|SIEM|IDS|IPS|WAF|VPN|VLAN|DMZ|HSM|PKI|RBAC|ABAC"
    r"|ISO\s*27\d{3}|SOC\s*2|PCI[\s-]DSS|NIST"
    r"|Firewall|Antivirus|EDR|XDR|SOAR|DLP"
    r"|SMS|E-Mail|Fax|Telefon"
    r"|JSON|XML|CSV|PDF|YAML"
    r"|PostgreSQL|MySQL|MongoDB|Redis|Kafka"
    r"|Docker|Kubernetes|AWS|Azure|GCP"
    r"|Active\s*Directory|RADIUS|Kerberos"
    r"|RSyslog|Splunk|ELK|Grafana|Prometheus"
    r"|Git|Jenkins|Terraform|Ansible)\b",
    re.IGNORECASE,
)


def _classify_trigger_type(obligation_text: str, condition: str) -> str:
    """Classify when an obligation is triggered: event/periodic/continuous.

    Event triggers take precedence over periodic ones; anything without a
    recognizable trigger signal is treated as continuous.
    """
    combined = f"{obligation_text} {condition}"
    if _EVENT_TRIGGERS.search(combined):
        return "event"
    if _PERIODIC_TRIGGERS.search(combined):
        return "periodic"
    return "continuous"


def _is_implementation_specific_text(
    obligation_text: str, action: str, obj: str
) -> bool:
    """Check if an obligation references concrete implementation details.

    True when text, action, or object mentions at least one concrete
    tool/protocol/format from _IMPL_SPECIFIC_PATTERNS.
    """
    combined = f"{obligation_text} {action} {obj}"
    matches = _IMPL_SPECIFIC_PATTERNS.findall(combined)
    return len(matches) >= 1


def _text_similar(a: str, b: str, threshold: float = 0.75) -> bool:
    """Quick token-overlap similarity check (Jaccard on words).

    Whitespace-tokenized, case-sensitive. Empty or all-whitespace inputs
    are never similar to anything.
    """
    if not a or not b:
        return False
    tokens_a = set(a.split())
    tokens_b = set(b.split())
    if not tokens_a or not tokens_b:
        return False
    intersection = tokens_a & tokens_b
    union = tokens_a | tokens_b
    return len(intersection) / len(union) >= threshold


def _is_more_implementation_specific(text_a: str, text_b: str) -> bool:
    """Return True if text_a is more implementation-specific than text_b.

    Compares the count of implementation-specific keyword hits first.
    """
    matches_a = len(_IMPL_SPECIFIC_PATTERNS.findall(text_a))
    matches_b = len(_IMPL_SPECIFIC_PATTERNS.findall(text_b))
    if matches_a != matches_b:
        return matches_a > matches_b
    # Tie-break: longer text is usually more specific
    return len(text_a) > len(text_b)


# ---------------------------------------------------------------------------
# Data classes
# ---------------------------------------------------------------------------


@dataclass
class ObligationCandidate:
    """A single normative obligation extracted from a Rich Control."""

    candidate_id: str = ""
    # Link back to the Rich Control this obligation was extracted from
    # (guardrail rule 6: parent link always preserved).
    parent_control_uuid: str = ""
    obligation_text: str = ""
    action: str = ""
    # Trailing underscore avoids shadowing the builtin `object`.
    object_: str = ""
    condition: Optional[str] = None
    normative_strength: str = "must"
    obligation_type: str = "pflicht"  # pflicht | empfehlung | kann
    is_test_obligation: bool = False
    is_reporting_obligation: bool = False
    extraction_confidence: float = 0.0
    quality_flags: dict = field(default_factory=dict)
    release_state: str = "extracted"

    def to_dict(self) -> dict:
        """Serialize all fields; `object_` is exposed under the key "object"."""
        return {
            "candidate_id": self.candidate_id,
            "parent_control_uuid": self.parent_control_uuid,
            "obligation_text": self.obligation_text,
            "action": self.action,
            "object": self.object_,
            "condition": self.condition,
            "normative_strength": self.normative_strength,
            "obligation_type": self.obligation_type,
            "is_test_obligation": self.is_test_obligation,
            "is_reporting_obligation": self.is_reporting_obligation,
            "extraction_confidence": self.extraction_confidence,
            "quality_flags": self.quality_flags,
            "release_state": self.release_state,
        }


@dataclass
class AtomicControlCandidate:
    """An atomic control composed from a single ObligationCandidate."""

    candidate_id: str = ""
    parent_control_uuid: str = ""
    # Link to the ObligationCandidate this control was composed from.
    obligation_candidate_id: str = ""
    title: str = ""
    objective: str = ""
    requirements: list = field(default_factory=list)
    test_procedure: list = field(default_factory=list)
    evidence: list = field(default_factory=list)
    severity: str = "medium"
    category: str = ""
    domain: str = ""
    source_regulation: str = ""
    source_article: str = ""

    def to_dict(self) -> dict:
        # NOTE(review): source_regulation and source_article are set on the
        # dataclass but omitted from this dict — confirm this is intentional.
        return {
            "candidate_id": self.candidate_id,
            "parent_control_uuid": self.parent_control_uuid,
            "obligation_candidate_id": self.obligation_candidate_id,
            "title": self.title,
            "objective": self.objective,
            "requirements": self.requirements,
            "test_procedure": self.test_procedure,
            "evidence": self.evidence,
            "severity": self.severity,
            "category": self.category,
            "domain": self.domain,
        }


# ---------------------------------------------------------------------------
# Quality Gate
# ---------------------------------------------------------------------------


def classify_obligation_type(txt: str) -> str:
    """Classify obligation text into pflicht/empfehlung/kann.

    Priority: pflicht > empfehlung > kann > empfehlung (default).
    Nothing is rejected — obligations without normative signal default
    to 'empfehlung' (recommendation).
    """
    if _PFLICHT_RE.search(txt):
        return "pflicht"
    if _EMPFEHLUNG_RE.search(txt):
        return "empfehlung"
    if _KANN_RE.search(txt):
        return "kann"
    # No signal at all — LLM thought it was an obligation, classify
    # as recommendation (the user can still use it).
    return "empfehlung"


def quality_gate(candidate: ObligationCandidate) -> dict:
    """Validate an obligation candidate. Returns quality flags dict.

    Checks:
      has_normative_signal: text contains normative language (informational)
      obligation_type: pflicht | empfehlung | kann (classified, never rejected)
      single_action: only one main action (heuristic)
      not_rationale: not just a justification/reasoning
      not_evidence_only: not just an evidence requirement
      min_length: text is long enough to be meaningful
      has_parent_link: references back to parent control
    """
    txt = candidate.obligation_text
    flags = {}

    # 1. Normative signal (informational — no longer used for rejection)
    flags["has_normative_signal"] = bool(_NORMATIVE_RE.search(txt))

    # 1b. Obligation type classification
    flags["obligation_type"] = classify_obligation_type(txt)

    # 2. Single action heuristic — count "und" / "and" / "sowie" splits
    #    that connect different verbs (imperfect but useful)
    # NOTE(review): compiled on every call; could be hoisted to module level.
    multi_verb_re = re.compile(
        r"\b(und|sowie|als auch)\b.*\b(müssen|sicherstellen|implementieren"
        r"|dokumentieren|melden|testen|prüfen|überwachen|gewährleisten)\b",
        re.IGNORECASE,
    )
    flags["single_action"] = not bool(multi_verb_re.search(txt))

    # 3. Not rationale — more normative hits than rationale hits
    normative_count = len(_NORMATIVE_RE.findall(txt))
    rationale_count = len(_RATIONALE_RE.findall(txt))
    flags["not_rationale"] = normative_count >= rationale_count

    # 4. Not evidence-only (evidence fragments are typically short noun phrases)
    evidence_only_re = re.compile(
        r"^(Nachweis|Dokumentation|Screenshot|Protokoll|Bericht|Zertifikat)",
        re.IGNORECASE,
    )
    flags["not_evidence_only"] = not bool(evidence_only_re.match(txt.strip()))

    # 5. Min length
    flags["min_length"] = len(txt.strip()) >= 20

    # 6. Parent link (guardrail rule 6)
    flags["has_parent_link"] = bool(candidate.parent_control_uuid)

    return flags


def passes_quality_gate(flags: dict) -> bool:
    """Check if critical quality flags pass.

    Note: has_normative_signal is NO LONGER critical — obligations without
    normative signal are classified as 'empfehlung' instead of being
    rejected.
    """
    critical = ["not_evidence_only", "min_length", "has_parent_link"]
    return all(flags.get(k, False) for k in critical)


# ---------------------------------------------------------------------------
# LLM Prompts
# ---------------------------------------------------------------------------

# System prompt for Pass 0a (obligation extraction). German on purpose — the
# source controls are German; do not translate runtime strings.
_PASS0A_SYSTEM_PROMPT = """\
Du bist ein Rechts-Compliance-Experte. Du zerlegst Compliance-Controls \
in einzelne atomare Pflichten.

ANALYSE-SCHRITTE (intern durchfuehren, NICHT im Output!):
1. Identifiziere den Adressaten (Wer muss handeln?)
2. Identifiziere die Handlung (Was muss getan werden?)
3. Bestimme die normative Staerke (muss/soll/kann)
4. Pruefe ob Test- oder Meldepflicht vorliegt (separat erfassen!)
5. Formuliere jede Pflicht als eigenstaendiges JSON-Objekt

REGELN (STRIKT EINHALTEN):
1. Nur normative Aussagen extrahieren — erkennbar an: müssen, haben \
sicherzustellen, sind verpflichtet, ist zu dokumentieren, ist zu melden, \
ist zu testen, shall, must, required.
2. Jede Pflicht hat genau EIN Hauptverb / eine Handlung.
3. Testpflichten SEPARAT von operativen Pflichten (is_test_obligation=true).
4. Meldepflichten SEPARAT (is_reporting_obligation=true).
5. NICHT auf Evidence-Ebene zerlegen (z.B. "DR-Plan vorhanden" ist KEIN \
eigenes Control, sondern Evidence).
6. Begründungen, Erläuterungen und Erwägungsgründe sind KEINE Pflichten \
— NICHT extrahieren.

Antworte NUR mit einem JSON-Array. Keine Erklärungen."""


def _build_pass0a_prompt(
    title: str, objective: str, requirements: str, test_procedure: str, source_ref: str
) -> str:
    """Build the Pass-0a user prompt for a single Rich Control.

    Doubled braces in the JSON example keep them literal inside the f-string.
    """
    return f"""\
Analysiere das folgende Control und extrahiere alle einzelnen normativen \
Pflichten als JSON-Array.

CONTROL:
Titel: {title}
Ziel: {objective}
Anforderungen: {requirements}
Prüfverfahren: {test_procedure}
Quellreferenz: {source_ref}

Antworte als JSON-Array:
[
  {{
    "obligation_text": "Kurze, präzise Formulierung der Pflicht",
    "action": "Hauptverb/Handlung",
    "object": "Gegenstand der Pflicht",
    "condition": "Auslöser/Bedingung oder null",
    "normative_strength": "must",
    "is_test_obligation": false,
    "is_reporting_obligation": false
  }}
]"""


# System prompt for Pass 0b (atomic control composition).
_PASS0B_SYSTEM_PROMPT = """\
Du bist ein Security-Compliance-Experte. Du erstellst aus einer einzelnen \
normativen Pflicht ein praxisorientiertes, atomares Security Control.

ANALYSE-SCHRITTE (intern durchfuehren, NICHT im Output!):
1. Identifiziere die konkrete Anforderung aus der Pflicht
2. Leite eine umsetzbare technische/organisatorische Massnahme ab
3. Definiere ein Pruefverfahren (wie wird Umsetzung verifiziert?)
4. Bestimme den Nachweis (welches Dokument/Artefakt belegt Compliance?)

Das Control muss UMSETZBAR sein — keine Gesetzesparaphrase.

Antworte NUR als JSON. Keine Erklärungen."""


# ---------------------------------------------------------------------------
# Deterministic Atomic Control Composition Engine v2
# ---------------------------------------------------------------------------
# Transforms obligation candidates into atomic controls WITHOUT LLM.
#
# Pipeline:
#   1. split_compound_action() — split "erstellen und implementieren" → 2
#   2. classify_action()       — fine-grained action types
#   3. classify_object()       — policy / technical / process / register / …
#   4. trigger_qualifier()     — periodic / event / continuous → timing text
#   5. template lookup         — (action_type, object_class) → test + evidence
#   6. compose                 — assemble AtomicControlCandidate
# ---------------------------------------------------------------------------
compose — assemble AtomicControlCandidate # --------------------------------------------------------------------------- # ── 1. Compound Action Splitter ────────────────────────────────────────── _COMPOUND_SPLIT_RE = re.compile( r"\s+(?:und|sowie|als\s+auch|,\s*(?:und|sowie))\s+", re.IGNORECASE ) # Phrases that should never be split (stylistic variants, not separate actions) _NO_SPLIT_PHRASES: set[str] = { "pflegen und aufrechterhalten", "aufrechterhalten und pflegen", "erkennen und verhindern", "verhindern und erkennen", "sichern und schützen", "schützen und sichern", "schützen und absichern", "absichern und schützen", "ermitteln und bewerten", "bewerten und ermitteln", "prüfen und überwachen", "überwachen und prüfen", } def _split_compound_action(action: str) -> list[str]: """Split compound actions into individual sub-actions. Only splits if: - the parts map to *different* action types - the phrase is not in the no-split list Keeps phrases like 'aufrechterhalten und pflegen' together because both map to 'maintain'. """ if not action: return [action] # Check no-split list first if action.lower().strip() in _NO_SPLIT_PHRASES: return [action] parts = _COMPOUND_SPLIT_RE.split(action.strip()) if len(parts) <= 1: return [action] # Classify each part — only split if types differ types = [_classify_action(p.strip()) for p in parts] if len(set(types)) > 1: return [p.strip() for p in parts if p.strip()] return [action] # ── 2. 
# ── 2. Action Type Classification ────────────────────────────────────────

# Priority order used to pick ONE type when a compound action matches
# several. Negative/prohibitive types first, then technical, then
# verification, then reporting/housekeeping.
_ACTION_PRIORITY = [
    "prevent", "exclude", "forbid",
    "implement", "configure", "encrypt", "restrict_access", "enforce",
    "invalidate", "issue", "rotate",
    "monitor", "review", "assess", "audit", "test", "verify", "validate",
    "report", "notify", "train", "delete", "retain",
    "ensure", "define", "document", "maintain",
    "approve", "remediate", "perform", "obtain",
]

# (keyword, action_type) pairs matched by plain substring search on the
# lowercased action string. All matching types are collected and resolved
# via _ACTION_PRIORITY.
_ACTION_KEYWORDS: list[tuple[str, str]] = [
    # ── Negative / prohibitive actions (highest priority) ────
    ("dürfen keine", "prevent"),
    ("dürfen nicht", "prevent"),
    ("darf keine", "prevent"),
    ("darf nicht", "prevent"),
    ("nicht zulässig", "forbid"),
    ("nicht erlaubt", "forbid"),
    ("nicht gestattet", "forbid"),
    ("untersagt", "forbid"),
    ("verboten", "forbid"),
    ("nicht enthalten", "exclude"),
    ("nicht übertragen", "prevent"),
    ("nicht übermittelt", "prevent"),
    ("nicht wiederverwendet", "prevent"),
    ("nicht gespeichert", "prevent"),
    ("verhindern", "prevent"),
    ("unterbinden", "prevent"),
    ("ausschließen", "exclude"),
    ("vermeiden", "prevent"),
    ("ablehnen", "exclude"),
    ("zurückweisen", "exclude"),
    # ── Session / lifecycle actions ──────────────────────────
    ("ungültig machen", "invalidate"),
    ("invalidieren", "invalidate"),
    ("widerrufen", "invalidate"),
    ("session beenden", "invalidate"),
    ("vergeben", "issue"),
    ("ausstellen", "issue"),
    ("erzeugen", "issue"),
    ("generieren", "issue"),
    ("rotieren", "rotate"),
    ("erneuern", "rotate"),
    ("durchsetzen", "enforce"),
    ("erzwingen", "enforce"),
    # ── Multi-word patterns (longest match wins) ─────────────
    ("aktuell halten", "maintain"),
    ("aufrechterhalten", "maintain"),
    ("sicherstellen", "ensure"),
    ("gewährleisten", "ensure"),
    ("benachrichtigen", "notify"),
    ("sensibilisieren", "train"),
    ("authentifizieren", "restrict_access"),
    ("verschlüsseln", "encrypt"),
    ("implementieren", "implement"),
    ("konfigurieren", "configure"),
    ("bereitstellen", "implement"),
    ("protokollieren", "document"),
    ("dokumentieren", "document"),
    ("kontrollieren", "monitor"),
    ("installieren", "implement"),
    ("autorisieren", "restrict_access"),
    ("beschränken", "restrict_access"),
    ("berechtigen", "restrict_access"),
    ("aufbewahren", "retain"),
    ("archivieren", "retain"),
    ("überwachen", "monitor"),
    ("überprüfen", "review"),
    ("auditieren", "audit"),
    ("informieren", "notify"),
    ("analysieren", "assess"),
    ("verifizieren", "verify"),
    ("validieren", "validate"),
    ("evaluieren", "assess"),
    ("integrieren", "implement"),
    ("aktivieren", "configure"),
    ("einrichten", "configure"),
    ("einführen", "implement"),
    ("unterweisen", "train"),
    ("durchführen", "perform"),
    ("verarbeiten", "perform"),
    ("vernichten", "delete"),
    ("entfernen", "delete"),
    ("absichern", "implement"),
    ("schützen", "implement"),
    ("bewerten", "assess"),
    ("umsetzen", "implement"),
    ("aufbauen", "implement"),
    ("erstellen", "document"),
    ("definieren", "define"),
    ("festlegen", "define"),
    ("vorgeben", "define"),
    ("verfassen", "document"),
    ("einholen", "obtain"),
    ("genehmigen", "approve"),
    ("freigeben", "approve"),
    ("zulassen", "approve"),
    ("beheben", "remediate"),
    ("korrigieren", "remediate"),
    ("beseitigen", "remediate"),
    ("nachbessern", "remediate"),
    ("speichern", "retain"),
    ("mitteilen", "notify"),
    ("berichten", "report"),
    ("schulen", "train"),
    ("melden", "report"),
    ("prüfen", "review"),
    ("testen", "test"),
    ("führen", "document"),
    ("pflegen", "maintain"),
    ("wahren", "maintain"),
    ("löschen", "delete"),
    ("angeben", "document"),
    ("beifügen", "document"),
    # English fallbacks
    ("implement", "implement"),
    ("configure", "configure"),
    ("establish", "define"),
    ("document", "document"),
    ("maintain", "maintain"),
    ("monitor", "monitor"),
    ("review", "review"),
    ("assess", "assess"),
    ("audit", "audit"),
    ("encrypt", "encrypt"),
    ("restrict", "restrict_access"),
    ("authorize", "restrict_access"),
    ("verify", "verify"),
    ("validate", "validate"),
    ("report", "report"),
    ("notify", "notify"),
    ("train", "train"),
    ("test", "test"),
    ("delete", "delete"),
    ("retain", "retain"),
    ("ensure", "ensure"),
    ("approve", "approve"),
    ("remediate", "remediate"),
    ("perform", "perform"),
    ("obtain", "obtain"),
    ("prevent", "prevent"),
    ("forbid", "forbid"),
    ("exclude", "exclude"),
    ("invalidate", "invalidate"),
    ("revoke", "invalidate"),
    ("issue", "issue"),
    ("generate", "issue"),
    ("rotate", "rotate"),
    ("enforce", "enforce"),
]


def _classify_action(action: str) -> str:
    """Classify an obligation action string into one of the fine-grained
    action types listed in _ACTION_PRIORITY.

    For compound actions (several keyword hits), returns the
    highest-priority matching type; "default" when nothing matches.
    """
    if not action:
        return "default"
    action_lower = action.lower().strip()
    # Collect every type whose keyword occurs as a substring.
    matches: set[str] = set()
    for keyword, atype in _ACTION_KEYWORDS:
        if keyword in action_lower:
            matches.add(atype)
    if not matches:
        return "default"
    # Resolve ambiguity by the fixed priority order.
    for prio in _ACTION_PRIORITY:
        if prio in matches:
            return prio
    # Fallback for a matched type missing from the priority list.
    return next(iter(matches))


# ── 3. Object Class Classification ──────────────────────────────────────

# keyword → object-class lookup; substring match against the lowercased
# object string. Dict insertion order matters: the FIRST class with a
# matching keyword wins in _classify_object.
_OBJECT_CLASS_KEYWORDS: dict[str, list[str]] = {
    # ── Governance / Documentation ────────────────────────────
    "policy": [
        "richtlinie", "policy", "konzept", "strategie", "leitlinie",
        "vorgabe", "regelung", "ordnung", "anweisung", "standard",
        "rahmenwerk", "sicherheitskonzept", "datenschutzkonzept",
    ],
    "procedure": [
        "verfahren", "workflow", "ablauf", "vorgehensweise", "methodik",
        "prozedur", "handlungsanweisung",
    ],
    "register": [
        "verzeichnis", "register", "inventar", "liste", "katalog",
        "übersicht", "bestandsaufnahme",
    ],
    "record": [
        "protokoll", "log", "aufzeichnung", "nachweis", "evidenz",
        "artefakt", "dokumentation",
    ],
    "report": [
        "meldung", "bericht", "report", "benachrichtigung", "mitteilung",
        "anzeige", "statusbericht",
    ],
    # ── Technical / Security ──────────────────────────────────
    "technical_control": [
        "mfa", "firewall", "verschlüsselung", "backup", "antivirus",
        "ids", "ips", "waf", "vpn", "tls", "ssl", "patch", "update",
        "härtung", "segmentierung", "alarmierung", "monitoring",
    ],
    "access_control": [
        "authentifizierung", "autorisierung", "zugriff", "berechtigung",
        "passwort", "kennwort", "anmeldung", "sso", "rbac",
    ],
    "session": [
        "session", "sitzung", "sitzungsverwaltung", "session management",
        "session-id", "session-token", "idle timeout",
        "inaktivitäts-timeout", "inaktivitätszeitraum", "logout",
        "abmeldung",
    ],
    "cookie": [
        "cookie", "session-cookie", "secure-flag", "httponly", "samesite",
        "cookie-attribut",
    ],
    "jwt": [
        "jwt", "json web token", "bearer token", "jwt-algorithmus",
        "jwt-signatur",
    ],
    "federated_assertion": [
        "assertion", "saml", "oidc", "openid", "föderiert", "federated",
        "identity provider",
    ],
    "cryptographic_control": [
        "schlüssel", "zertifikat", "signatur", "kryptographi", "cipher",
        "hash", "token", "entropie",
    ],
    "configuration": [
        "konfiguration", "einstellung", "parameter", "baseline",
        "hardening", "härtungsprofil",
    ],
    "account": [
        "account", "konto", "benutzer", "privilegiert", "admin", "root",
        "dienstkonto", "servicekonto",
    ],
    # ── Data / Systems ────────────────────────────────────────
    "system": [
        "system", "plattform", "dienst", "service", "anwendung",
        "software", "komponente", "infrastruktur", "netzwerk", "server",
        "datenbank", "produkt", "gerät", "endgerät",
    ],
    "data": [
        "daten", "information", "personenbezogen", "datensatz", "datei",
        "inhalt", "verarbeitungstätigkeit",
    ],
    "interface": [
        "schnittstelle", "interface", "api", "integration", "datenfluss",
        "datenübermittlung",
    ],
    # ── People / Org ──────────────────────────────────────────
    "role": [
        "mitarbeiter", "personal", "rolle", "beauftragter",
        "verantwortlicher", "team", "abteilung", "beschäftigte",
    ],
    "training": [
        "schulung", "training", "sensibilisierung", "awareness",
        "unterweisung", "fortbildung", "qualifikation",
    ],
    # ── Incident / Risk ───────────────────────────────────────
    "incident": [
        "vorfall", "incident", "sicherheitsvorfall", "störung", "notfall",
        "krise", "bedrohung",
    ],
    "risk_artifact": [
        "risiko", "schwachstelle", "vulnerability", "gefährdung",
        "risikoanalyse", "risikobewertung", "schutzbedarfsfeststellung",
    ],
    # ── Process / Consent ─────────────────────────────────────
    "process": [
        "prozess", "geschäftsprozess", "betriebsprozess",
        "managementprozess", "steuerungsprozess",
    ],
    "consent": [
        "einwilligung", "consent", "einverständnis", "zustimmung",
        "opt-in", "opt-out",
    ],
}


def _classify_object(object_: str) -> str:
    """Classify the obligation object into a domain class.

    Returns the first class (in dict insertion order) with a keyword
    substring match; "general" when nothing matches or input is empty.
    """
    if not object_:
        return "general"
    obj_lower = object_.lower()
    for obj_class, keywords in _OBJECT_CLASS_KEYWORDS.items():
        if any(k in obj_lower for k in keywords):
            return obj_class
    return "general"


# ── 4. Trigger / Timing Qualifier ───────────────────────────────────────

# (regex, human-readable German timing qualifier) — first match wins.
_FREQUENCY_PATTERNS: list[tuple[str, str]] = [
    (r"jährl", "jährlich"),
    (r"quartal", "quartalsweise"),
    (r"halbjähr", "halbjährlich"),
    (r"monatl", "monatlich"),
    (r"wöchentl", "wöchentlich"),
    (r"regelmäßig", "regelmässig"),
    (r"72\s*Stunden", "innerhalb von 72 Stunden"),
    (r"unverzüglich", "unverzüglich"),
    (r"ohne\s+Verzögerung", "ohne unangemessene Verzögerung"),
    (r"vor\s+Inbetriebnahme", "vor Inbetriebnahme"),
    (r"vor\s+Markteinführung", "vor Markteinführung"),
    (r"vor\s+Freigabe", "vor Freigabe"),
]


def _extract_trigger_qualifier(
    trigger_type: Optional[str],
    obligation_text: str,
) -> str:
    """Extract timing/trigger context for test procedures.

    A concrete frequency found in the obligation text takes precedence
    over the generic trigger_type wording.
    """
    # Try to find specific frequency in obligation text
    for pattern, qualifier in _FREQUENCY_PATTERNS:
        if re.search(pattern, obligation_text, re.IGNORECASE):
            return qualifier
    if trigger_type == "event":
        return "bei Eintreten des auslösenden Ereignisses"
    if trigger_type == "periodic":
        return "periodisch"
    return ""  # continuous → no special qualifier
Structured Timing Extraction ──────────────────────────────────── _STRUCTURED_FREQUENCY_MAP: list[tuple[str, Optional[int], Optional[str]]] = [ # (regex_pattern, deadline_hours, frequency) (r"72\s*Stunden", 72, None), (r"48\s*Stunden", 48, None), (r"24\s*Stunden", 24, None), (r"unverzüglich", 0, None), (r"ohne\s+Verzögerung", 0, None), (r"sofort", 0, None), (r"jährl", None, "yearly"), (r"halbjähr", None, "semi_annually"), (r"quartal", None, "quarterly"), (r"monatl", None, "monthly"), (r"wöchentl", None, "weekly"), (r"täglich", None, "daily"), (r"regelmäßig", None, "periodic"), (r"periodisch", None, "periodic"), (r"vor\s+Inbetriebnahme", None, "before_deployment"), (r"vor\s+Markteinführung", None, "before_launch"), (r"vor\s+Freigabe", None, "before_release"), ] def _extract_structured_timing( obligation_text: str, ) -> tuple[Optional[int], Optional[str]]: """Extract deadline_hours and frequency from obligation text. Returns (deadline_hours, frequency). Both may be None. """ for pattern, deadline, freq in _STRUCTURED_FREQUENCY_MAP: if re.search(pattern, obligation_text, re.IGNORECASE): return (deadline, freq) return (None, None) # ── 5. Template Matrix: (action_type, object_class) → templates ───────── # # Specific combos override base templates. Lookup order: # 1. _SPECIFIC_TEMPLATES[(action_type, object_class)] # 2. _ACTION_TEMPLATES[action_type] # 3. 
_DEFAULT_ACTION_TEMPLATE _ACTION_TEMPLATES: dict[str, dict[str, list[str]]] = { # ── Create / Define / Document ───────────────────────────── "define": { "test_procedure": [ "Prüfung, ob {object} definiert und formal freigegeben ist", "Review der Inhalte auf Vollständigkeit und Angemessenheit", "Verifizierung, dass {object} den Betroffenen kommuniziert wurde", ], "evidence": [ "Freigegebenes Dokument mit Geltungsbereich", "Kommunikationsnachweis (E-Mail, Intranet, Schulung)", ], }, "document": { "test_procedure": [ "Prüfung, ob {object} dokumentiert und aktuell ist", "Sichtung der Dokumentation auf Vollständigkeit", "Verifizierung der Versionierung und des Review-Zyklus", ], "evidence": [ "Dokument mit Versionshistorie", "Freigabenachweis (Unterschrift/Approval)", ], }, "maintain": { "test_procedure": [ "Prüfung, ob {object} aktuell gehalten wird", "Vergleich der letzten Aktualisierung mit dem Review-Zyklus", "Stichprobe: Änderungen nach relevanten Ereignissen nachvollzogen", ], "evidence": [ "Änderungshistorie mit Datum und Verantwortlichem", "Nachweis des letzten Reviews", ], }, # ── Implement / Configure ────────────────────────────────── "implement": { "test_procedure": [ "Prüfung der technischen Umsetzung von {object}", "Funktionstest der implementierten Massnahme", "Review der Konfiguration gegen die Anforderungsspezifikation", ], "evidence": [ "Konfigurationsnachweis (Screenshot/Export)", "Implementierungsdokumentation", ], }, "configure": { "test_procedure": [ "Prüfung der Konfiguration von {object} gegen Soll-Vorgaben", "Vergleich mit Hardening-Baseline oder Best Practice", "Automatisierter Konfigurationsscan (falls verfügbar)", ], "evidence": [ "Konfigurationsexport mit Soll-/Ist-Vergleich", "Scan-Bericht oder Compliance-Check-Ergebnis", ], }, # ── Monitor / Review / Assess / Audit ────────────────────── "monitor": { "test_procedure": [ "Prüfung der laufenden Überwachung von {object}", "Stichprobe der Protokolle/Logs der letzten 3 Monate", "Verifizierung 
der Alarmierungs- und Eskalationsprozesse", ], "evidence": [ "Monitoring-Dashboard-Export oder Log-Auszüge", "Alarmierungsregeln und Eskalationsmatrix", ], }, "review": { "test_procedure": [ "Prüfung, ob {object} im vorgesehenen Zyklus überprüft wurde", "Sichtung des Review-Protokolls auf Massnahmenableitung", "Verifizierung der Umsetzung identifizierter Massnahmen", ], "evidence": [ "Review-Protokoll mit Datum und Teilnehmern", "Massnahmenplan mit Umsetzungsstatus", ], }, "assess": { "test_procedure": [ "Prüfung der Bewertungsmethodik für {object}", "Sichtung der letzten Bewertungsergebnisse", "Verifizierung, dass Ergebnisse in Massnahmen überführt wurden", ], "evidence": [ "Bewertungsbericht mit Methodik und Ergebnissen", "Abgeleiteter Massnahmenplan", ], }, "audit": { "test_procedure": [ "Prüfung des Audit-Plans und der Audit-Durchführung für {object}", "Sichtung der Audit-Berichte und Findings", "Verifizierung der Nachverfolgung offener Findings", ], "evidence": [ "Audit-Bericht mit Findings und Empfehlungen", "Finding-Tracker mit Umsetzungsstatus", ], }, # ── Test / Verify / Validate ─────────────────────────────── "test": { "test_procedure": [ "Review der Testpläne und -methodik für {object}", "Stichprobe der Testergebnisse und Massnahmenableitung", "Prüfung der Testabdeckung und -häufigkeit", ], "evidence": [ "Testprotokoll mit Ergebnissen", "Testplan und Abdeckungsbericht", ], }, "verify": { "test_procedure": [ "Prüfung der Verifikationsmethodik für {object}", "Nachvollzug der Verifikationsergebnisse gegen Spezifikation", "Prüfung, ob alle Anforderungen abgedeckt sind", ], "evidence": [ "Verifikationsbericht mit Soll-/Ist-Abgleich", "Anforderungs-Traceability-Matrix", ], }, "validate": { "test_procedure": [ "Prüfung der Validierungsmethodik für {object}", "Bewertung, ob die Massnahme den Zweck im Praxiseinsatz erfüllt", "Auswertung von Nutzerfeedback oder Betriebsdaten", ], "evidence": [ "Validierungsbericht", "Praxisnachweis (Betriebsdaten, 
Nutzerfeedback)", ], }, # ── Report / Notify ──────────────────────────────────────── "report": { "test_procedure": [ "Prüfung des Meldeprozesses für {object}", "Stichprobe gemeldeter Vorfälle auf Vollständigkeit und Fristeneinhaltung", "Verifizierung der Meldekanäle und Zuständigkeiten", ], "evidence": [ "Meldeprozess-Dokumentation", "Nachweise über erfolgte Meldungen mit Zeitstempeln", ], }, "notify": { "test_procedure": [ "Prüfung des Benachrichtigungsprozesses für {object}", "Verifizierung der Empfänger und Kommunikationskanäle", "Stichprobe: Benachrichtigungen fristgerecht versendet", ], "evidence": [ "Benachrichtigungsvorlagen und Verteiler", "Versandnachweise mit Zeitstempeln", ], }, # ── Train ────────────────────────────────────────────────── "train": { "test_procedure": [ "Prüfung der Schulungsinhalte und -unterlagen zu {object}", "Verifizierung der Teilnehmerlisten und Schulungsfrequenz", "Stichprobe: Wissensstand durch Befragung oder Kurztest", ], "evidence": [ "Schulungsunterlagen und Schulungsplan", "Teilnehmerlisten mit Datum und Unterschrift", "Ergebnisse von Wissenstests (falls durchgeführt)", ], }, # ── Access / Encrypt ─────────────────────────────────────── "restrict_access": { "test_procedure": [ "Review der Zugriffsberechtigungen für {object}", "Prüfung der Berechtigungsmatrix auf Aktualität und Least-Privilege", "Stichprobe: Entzug von Rechten bei Rollenwechsel/Austritt", ], "evidence": [ "Aktuelle Berechtigungsmatrix", "Zugriffsprotokolle der letzten 3 Monate", "Nachweis des letzten Berechtigungs-Reviews", ], }, "encrypt": { "test_procedure": [ "Prüfung der Verschlüsselungskonfiguration für {object}", "Verifizierung der Algorithmen und Schlüssellängen gegen BSI-Empfehlungen", "Prüfung des Schlüsselmanagement-Prozesses (Rotation, Speicherung)", ], "evidence": [ "Kryptographie-Konzept", "Zertifikats- und Schlüsselinventar", "Schlüsselrotations-Nachweis", ], }, # ── Delete / Retain ──────────────────────────────────────── "delete": { 
"test_procedure": [ "Prüfung des Löschkonzepts für {object}", "Verifizierung der Löschfristen und automatisierten Löschmechanismen", "Stichprobe: Löschung nach Fristablauf tatsächlich durchgeführt", ], "evidence": [ "Löschkonzept mit definierten Fristen", "Löschprotokolle oder -nachweise", ], }, "retain": { "test_procedure": [ "Prüfung der Aufbewahrungsfristen und -orte für {object}", "Verifizierung der Zugriffskontrollen auf archivierte Daten", "Prüfung der automatischen Löschung nach Ablauf der Aufbewahrungsfrist", ], "evidence": [ "Aufbewahrungsrichtlinie mit Fristen", "Speicherort-Dokumentation mit Zugriffskonzept", ], }, # ── Ensure (catch-all for sicherstellen/gewährleisten) ───── "ensure": { "test_procedure": [ "Prüfung, ob Massnahmen für {object} wirksam umgesetzt sind", "Stichprobenprüfung der Einhaltung im operativen Betrieb", "Review der zugehörigen Prozessdokumentation", ], "evidence": [ "Nachweis der Umsetzung (Konfiguration/Prozess)", "Prüfprotokoll der letzten Überprüfung", ], }, # ── Perform / Obtain ─────────────────────────────────────── "perform": { "test_procedure": [ "Prüfung der Durchführung von {object}", "Verifizierung der Zuständigkeiten und Freigabeschritte", "Stichprobe der Durchführung anhand aktueller Fälle", ], "evidence": [ "Durchführungsnachweise (Tickets, Protokolle)", "Prozessdokumentation mit Verantwortlichkeiten", ], }, "obtain": { "test_procedure": [ "Prüfung des Einholungsprozesses für {object}", "Verifizierung der Vollständigkeit und Gültigkeit", "Stichprobe: Einholung vor Beginn der Verarbeitung nachgewiesen", ], "evidence": [ "Nachweise der Einholung (Einwilligungen, Freigaben)", "Gültigkeitsprüfung mit Zeitstempeln", ], }, # ── Prevent / Exclude / Forbid (negative norms) ──────────── "prevent": { "test_procedure": [ "Prüfung, dass {object} technisch verhindert wird", "Stichprobe: Versuch der verbotenen Aktion schlägt fehl", "Review der Konfiguration und Zugriffskontrollen", ], "evidence": [ "Konfigurationsnachweis der 
Präventionsmassnahme", "Testprotokoll der Negativtests", ], }, "exclude": { "test_procedure": [ "Prüfung, dass {object} ausgeschlossen ist", "Stichprobe: Verbotene Inhalte/Aktionen sind nicht vorhanden", "Automatisierter Scan oder manuelle Prüfung", ], "evidence": [ "Scan-Ergebnis oder Prüfprotokoll", "Konfigurationsnachweis", ], }, "forbid": { "test_procedure": [ "Prüfung, dass {object} untersagt und technisch blockiert ist", "Verifizierung der Richtlinie und technischen Durchsetzung", "Stichprobe: Versuch der untersagten Aktion wird abgelehnt", ], "evidence": [ "Richtlinie mit explizitem Verbot", "Technischer Nachweis der Blockierung", ], }, # ── Enforce / Invalidate / Issue / Rotate ──────────────── "enforce": { "test_procedure": [ "Prüfung der technischen Durchsetzung von {object}", "Stichprobe: Nicht-konforme Konfigurationen werden automatisch korrigiert oder abgelehnt", "Review der Enforcement-Regeln und Ausnahmen", ], "evidence": [ "Enforcement-Policy mit technischer Umsetzung", "Protokoll erzwungener Korrekturen oder Ablehnungen", ], }, "invalidate": { "test_procedure": [ "Prüfung, dass {object} korrekt ungültig gemacht wird", "Stichprobe: Nach Invalidierung kein Zugriff mehr möglich", "Verifizierung der serverseitigen Bereinigung", ], "evidence": [ "Protokoll der Invalidierungsaktionen", "Testnachweis der Zugriffsverweigerung nach Invalidierung", ], }, "issue": { "test_procedure": [ "Prüfung des Vergabeprozesses für {object}", "Verifizierung der kryptographischen Sicherheit und Entropie", "Stichprobe: Korrekte Vergabe unter definierten Bedingungen", ], "evidence": [ "Prozessdokumentation der Vergabe", "Nachweis der Entropie-/Sicherheitseigenschaften", ], }, "rotate": { "test_procedure": [ "Prüfung des Rotationsprozesses für {object}", "Verifizierung der Rotationsfrequenz und automatischen Auslöser", "Stichprobe: Alte Artefakte nach Rotation ungültig", ], "evidence": [ "Rotationsrichtlinie mit Frequenz", "Rotationsprotokoll mit Zeitstempeln", ], }, # ── 
Approve / Remediate ───────────────────────────────────
    "approve": {
        "test_procedure": [
            "Prüfung des Genehmigungsprozesses für {object}",
            "Verifizierung der Freigabeberechtigungen und Eskalationswege",
            "Stichprobe: Genehmigung vor Umsetzung/Nutzung nachgewiesen",
        ],
        "evidence": [
            "Freigabenachweis (Signatur, Ticket, Workflow)",
            "Genehmigungsmatrix mit Zuständigkeiten",
        ],
    },
    "remediate": {
        "test_procedure": [
            "Prüfung des Behebungsprozesses für {object}",
            "Verifizierung der Korrekturmassnahmen und Wirksamkeit",
            "Stichprobe: Abweichungen fristgerecht behoben",
        ],
        "evidence": [
            "Korrekturmassnahmen-Dokumentation",
            "Nachprüfprotokoll der Wirksamkeit",
        ],
    },
}

# ── Specific (action_type, object_class) overrides ───────────────────────
# Most-specific tier of the template lookup used by _compose_deterministic:
# an exact (action_type, object_class) pair beats the per-action templates
# above, which in turn beat _DEFAULT_ACTION_TEMPLATE. Each bundle carries
# audit test steps ("{object}" is substituted later) plus expected evidence.
_SPECIFIC_TEMPLATES: dict[tuple[str, str], dict[str, list[str]]] = {
    ("implement", "policy"): {
        "test_procedure": [
            "Prüfung, ob {object} dokumentiert, freigegeben und in relevanten Prozessen umgesetzt ist",
            "Interview mit Prozessverantwortlichen zur tatsächlichen Umsetzung",
            "Stichprobe: Nachweis der Umsetzung in der Praxis",
        ],
        "evidence": [
            "Freigegebenes Richtliniendokument",
            "Nachweis der Kommunikation an Betroffene",
            "Stichproben der operativen Umsetzung",
        ],
    },
    ("implement", "technical_control"): {
        "test_procedure": [
            "Prüfung der technischen Konfiguration von {object}",
            "Funktionstest: Wirksamkeit der Massnahme verifizieren",
            "Vulnerability-Scan oder Penetrationstest (falls anwendbar)",
        ],
        "evidence": [
            "Konfigurationsnachweis (Screenshot/Export)",
            "Testprotokoll mit Ergebnissen",
            "Scan-Bericht (falls durchgeführt)",
        ],
    },
    ("implement", "process"): {
        "test_procedure": [
            "Prüfung der Prozessdokumentation für {object}",
            "Verifizierung, dass der Prozess operativ gelebt wird",
            "Stichprobe: Prozessdurchführung anhand aktueller Fälle",
        ],
        "evidence": [
            "Prozessdokumentation mit RACI-Matrix",
            "Durchführungsnachweise der letzten 3 Monate",
        ],
    },
    ("define", "policy"): {
        "test_procedure": [
            "Prüfung, ob {object} formal definiert und durch Management freigegeben ist",
            "Review des Geltungsbereichs und der Adressaten",
            "Verifizierung der regelmässigen Aktualisierung",
        ],
        "evidence": [
            "Freigegebene Policy mit Unterschrift der Geschäftsleitung",
            "Geltungsbereich und Verteiler",
            "Letzte Aktualisierung mit Änderungshistorie",
        ],
    },
    ("monitor", "system"): {
        "test_procedure": [
            "Prüfung der Monitoring-Konfiguration für {object}",
            "Stichprobe der System-Logs und Alerts der letzten 3 Monate",
            "Verifizierung: Alerts führen zu dokumentierten Reaktionen",
        ],
        "evidence": [
            "Monitoring-Dashboard-Export",
            "Alert-Konfiguration und Eskalationsregeln",
            "Incident-Tickets aus Alert-Eskalation",
        ],
    },
    ("monitor", "incident"): {
        "test_procedure": [
            "Prüfung des Incident-Monitoring-Prozesses für {object}",
            "Stichprobe erkannter Vorfälle auf Reaktionszeit",
            "Verifizierung der Eskalationswege",
        ],
        "evidence": [
            "Incident-Log mit Erkennungs- und Reaktionszeiten",
            "Eskalationsmatrix",
        ],
    },
    ("review", "policy"): {
        "test_procedure": [
            "Prüfung, ob {object} im vorgesehenen Zyklus durch Management reviewed wurde",
            "Sichtung des Review-Protokolls auf Aktualisierungsbedarf",
            "Verifizierung, dass Änderungen umgesetzt wurden",
        ],
        "evidence": [
            "Review-Protokoll mit Datum und Teilnehmern",
            "Aktualisierte Version der Richtlinie (falls geändert)",
        ],
    },
    ("assess", "incident"): {
        "test_procedure": [
            "Prüfung der Risikobewertung für {object}",
            "Nachvollzug der Bewertungskriterien (Schwere, Auswirkung, Wahrscheinlichkeit)",
            "Verifizierung der abgeleiteten Massnahmen",
        ],
        "evidence": [
            "Risikobewertungs-Matrix",
            "Massnahmenplan mit Priorisierung",
        ],
    },
    ("report", "incident"): {
        "test_procedure": [
            "Prüfung des Meldeprozesses für {object} an zuständige Behörden",
            "Verifizierung der Meldefristen (z.B. 72h DSGVO, 24h NIS2)",
            "Stichprobe: Meldungen fristgerecht und vollständig",
        ],
        "evidence": [
            "Meldeprozess mit Fristen und Zuständigkeiten",
            "Kopien erfolgter Behördenmeldungen",
            "Zeitstempel-Nachweis der Fristwahrung",
        ],
    },
    ("train", "role"): {
        "test_procedure": [
            "Prüfung der Schulungspflicht für {object}",
            "Verifizierung der Teilnahme aller betroffenen Personen",
            "Stichprobe: Wissensstand durch Kurztest oder Befragung",
        ],
        "evidence": [
            "Schulungsplan mit Zielgruppen und Frequenz",
            "Teilnehmerlisten mit Datum und Unterschrift",
            "Testergebnisse oder Teilnahmebestätigungen",
        ],
    },
    ("restrict_access", "data"): {
        "test_procedure": [
            "Prüfung der Zugriffskontrollen für {object}",
            "Review der Berechtigungen nach Need-to-Know-Prinzip",
            "Stichprobe: Keine überprivilegierten Zugänge",
        ],
        "evidence": [
            "Berechtigungsmatrix mit Rollen und Datenklassen",
            "Zugriffsprotokolle",
            "Ergebnis des letzten Access Reviews",
        ],
    },
    ("restrict_access", "system"): {
        "test_procedure": [
            "Prüfung der Zugriffskontrollen für {object}",
            "Review der Admin-/Privileged-Zugänge",
            "Stichprobe: MFA aktiv, Passwort-Policy eingehalten",
        ],
        "evidence": [
            "Berechtigungsmatrix",
            "Audit-Log privilegierter Zugriffe",
            "MFA-Konfigurationsnachweis",
        ],
    },
    ("encrypt", "data"): {
        "test_procedure": [
            "Prüfung der Verschlüsselung von {object} at Rest und in Transit",
            "Verifizierung der Algorithmen gegen BSI TR-02102",
            "Prüfung des Key-Management-Prozesses",
        ],
        "evidence": [
            "Kryptographie-Konzept mit Algorithmen und Schlüssellängen",
            "TLS-Konfigurationsnachweis",
            "Key-Rotation-Protokoll",
        ],
    },
    ("delete", "data"): {
        "test_procedure": [
            "Prüfung des Löschkonzepts für {object}",
            "Verifizierung der automatisierten Löschmechanismen",
            "Stichprobe: Löschung personenbezogener Daten nach Fristablauf",
        ],
        "evidence": [
            "Löschkonzept mit Datenklassen und Fristen",
            "Löschprotokolle",
            "Nachweis der Vernichtung (bei physischen Medien)",
        ],
    },
    ("retain", "data"): {
        "test_procedure": [
            "Prüfung der Aufbewahrungsfristen für {object}",
            "Verifizierung der Speicherorte und Zugriffskontrollen",
            "Prüfung: Keine Aufbewahrung über gesetzliche Frist hinaus",
        ],
        "evidence": [
            "Aufbewahrungsrichtlinie mit gesetzlichen Grundlagen",
            "Speicherort-Inventar mit Zugriffskonzept",
        ],
    },
    ("obtain", "data"): {
        "test_procedure": [
            "Prüfung des Einwilligungsprozesses für {object}",
            "Verifizierung der Einwilligungstexte auf Rechtskonformität",
            "Stichprobe: Einwilligung vor Verarbeitungsbeginn eingeholt",
        ],
        "evidence": [
            "Einwilligungsformulare/-dialoge",
            "Consent-Log mit Zeitstempeln",
            "Widerrufsprozess-Dokumentation",
        ],
    },
}

# Last-resort template bundle when neither the specific (action, object)
# combination nor the per-action template matches.
_DEFAULT_ACTION_TEMPLATE: dict[str, list[str]] = {
    "test_procedure": [
        "Prüfung der Umsetzung von {object}",
        "Verifizierung der zugehörigen Dokumentation und Nachweisführung",
    ],
    "evidence": [
        "Umsetzungsnachweis",
        "Zugehörige Dokumentation",
    ],
}

# ── 6. Title Suffix (action_type → past participle / state) ─────────────
# German past-participle/state phrase appended to the object when composing
# atomic-control titles, e.g. "Backup-Konzept dokumentiert". Unknown action
# types fall back to "umgesetzt" at the lookup site.
_ACTION_STATE_SUFFIX: dict[str, str] = {
    "define": "definiert und freigegeben",
    "document": "dokumentiert",
    "maintain": "aktuell gehalten",
    "implement": "umgesetzt",
    "configure": "konfiguriert",
    "monitor": "überwacht",
    "review": "überprüft",
    "assess": "bewertet",
    "audit": "auditiert",
    "test": "getestet",
    "verify": "verifiziert",
    "validate": "validiert",
    "report": "gemeldet",
    "notify": "benachrichtigt",
    "train": "geschult",
    "restrict_access": "zugriffsbeschränkt",
    "encrypt": "verschlüsselt",
    "delete": "gelöscht",
    "retain": "aufbewahrt",
    "ensure": "sichergestellt",
    "approve": "genehmigt",
    "remediate": "behoben",
    "perform": "durchgeführt",
    "obtain": "eingeholt",
}

# ── 6b.
Pattern Candidates ──────────────────────────────────────────────
# Pattern hints handed to the downstream classification pass (Pass 2).
# Keyed by the exact (action_type, object_class) pair; the per-action
# fallback table below is consulted when no pair matches.
_PATTERN_CANDIDATES_MAP: dict[tuple[str, str], list[str]] = {
    ("define", "policy"): ["policy_documented", "policy_approved"],
    ("document", "policy"): ["policy_documented"],
    ("implement", "technical_control"): ["technical_safeguard_enabled", "security_control_tested"],
    ("implement", "policy"): ["policy_implemented", "policy_communicated"],
    ("implement", "process"): ["process_established", "process_operational"],
    ("monitor", "system"): ["continuous_monitoring_active"],
    ("monitor", "incident"): ["incident_detection_active"],
    ("review", "policy"): ["policy_review_completed"],
    ("review", "risk_artifact"): ["risk_review_completed"],
    ("assess", "risk_artifact"): ["risk_assessment_completed"],
    ("restrict_access", "data"): ["access_control_enforced"],
    ("restrict_access", "system"): ["access_control_enforced", "privilege_management_active"],
    ("restrict_access", "access_control"): ["access_control_enforced"],
    ("encrypt", "data"): ["encryption_at_rest", "encryption_in_transit"],
    ("encrypt", "cryptographic_control"): ["encryption_at_rest", "key_management_active"],
    ("train", "role"): ["awareness_training_completed"],
    ("train", "training"): ["awareness_training_completed"],
    ("report", "incident"): ["incident_reported_timely"],
    ("notify", "incident"): ["notification_sent_timely"],
    ("delete", "data"): ["data_deletion_completed"],
    ("retain", "data"): ["data_retention_enforced"],
    ("audit", "system"): ["audit_completed"],
    ("test", "technical_control"): ["security_control_tested"],
    ("obtain", "consent"): ["consent_obtained"],
    ("obtain", "data"): ["consent_obtained"],
    ("approve", "policy"): ["policy_approved"],
}

# Coarser fallback: pattern candidates by action type alone.
_PATTERN_CANDIDATES_BY_ACTION: dict[str, list[str]] = {
    "define": ["policy_documented"],
    "document": ["policy_documented"],
    "implement": ["control_implemented"],
    "monitor": ["continuous_monitoring_active"],
    "review": ["review_completed"],
    "assess": ["assessment_completed"],
    "audit": ["audit_completed"],
    "test": ["security_control_tested"],
    "report": ["incident_reported_timely"],
    "notify": ["notification_sent_timely"],
    "train": ["awareness_training_completed"],
    "restrict_access": ["access_control_enforced"],
    "encrypt": ["encryption_at_rest"],
    "delete": ["data_deletion_completed"],
    "retain": ["data_retention_enforced"],
    "ensure": ["control_implemented"],
    "approve": ["policy_approved"],
    "remediate": ["remediation_completed"],
    "perform": ["activity_performed"],
    "obtain": ["consent_obtained"],
    "configure": ["technical_safeguard_enabled"],
    "verify": ["verification_completed"],
    "validate": ["validation_completed"],
    "maintain": ["control_maintained"],
}

# ── 6c. Raw Infinitives (for validator Negativregeln) ────────────────────
# German action infinitives; the output validator flags test steps that
# contain "der/des/die <infinitive>" because that means an action verb
# leaked into a test procedure instead of being rewritten as a check.
_RAW_INFINITIVES: set[str] = {
    "implementieren",
    "dokumentieren",
    "definieren",
    "konfigurieren",
    "überwachen",
    "überprüfen",
    "auditieren",
    "testen",
    "verifizieren",
    "validieren",
    "melden",
    "benachrichtigen",
    "schulen",
    "verschlüsseln",
    "löschen",
    "aufbewahren",
    "sicherstellen",
    "gewährleisten",
    "genehmigen",
    "beheben",
    "durchführen",
    "einholen",
    "erstellen",
    "festlegen",
    "bereitstellen",
    "installieren",
    "einrichten",
    "bewerten",
    "analysieren",
    "kontrollieren",
    "protokollieren",
}

# ── 7.
Object Normalization (with synonym mapping) ──────────────────────
# German surface term (lowercase) -> canonical object key. Used by
# _normalize_object to collapse near-synonyms into one merge-hint key.
# Multi-word entries must be matched before whitespace is replaced by
# underscores (which _normalize_object guarantees by mapping first).
_OBJECT_SYNONYMS: dict[str, str] = {
    "verzeichnis": "register",
    "inventar": "register",
    "katalog": "register",
    "bestandsaufnahme": "register",
    "richtlinie": "policy",
    "konzept": "policy",
    "strategie": "policy",
    "leitlinie": "policy",
    "vorgabe": "policy",
    "regelung": "policy",
    "anweisung": "policy",
    "rahmenwerk": "policy",
    "sicherheitskonzept": "policy",
    "datenschutzkonzept": "policy",
    "verfahren": "procedure",
    "ablauf": "procedure",
    "vorgehensweise": "procedure",
    "methodik": "procedure",
    "prozedur": "procedure",
    "handlungsanweisung": "procedure",
    "protokoll": "record",
    "aufzeichnung": "record",
    "nachweis": "record",
    "evidenz": "record",
    "vorfall": "incident",
    "störung": "incident",
    "sicherheitsvorfall": "incident",
    "notfall": "incident",
    "krise": "incident",
    "schwachstelle": "risk_artifact",
    "gefährdung": "risk_artifact",
    "risikoanalyse": "risk_artifact",
    "risikobewertung": "risk_artifact",
    "mitarbeiter": "role",
    "personal": "role",
    "beauftragter": "role",
    "verantwortlicher": "role",
    "schulung": "training",
    "sensibilisierung": "training",
    "unterweisung": "training",
    "verschlüsselung": "technical_control",
    "firewall": "technical_control",
    "backup": "technical_control",
    "meldung": "report",
    "bericht": "report",
    "benachrichtigung": "report",
    "berechtigung": "access_control",
    "authentifizierung": "access_control",
    "zugriff": "access_control",
    "einwilligung": "consent",
    "zustimmung": "consent",
    # Near-synonym expansions found via heavy-control analysis (2026-03-28)
    "erkennung": "detection",
    "früherkennung": "detection",
    "frühzeitige erkennung": "detection",
    "frühzeitigen erkennung": "detection",
    "detektion": "detection",
    "eskalation": "escalation",
    "eskalationsprozess": "escalation",
    "eskalationsverfahren": "escalation",
    "benachrichtigungsprozess": "notification",
    "benachrichtigungsverfahren": "notification",
    "meldeprozess": "notification",
    "meldeverfahren": "notification",
    "meldesystem": "notification",
    "benachrichtigungssystem": "notification",
    "überwachung": "monitoring",
    "monitoring": "monitoring",
    "kontinuierliche überwachung": "monitoring",
    "laufende überwachung": "monitoring",
    "prüfung": "audit",
    "überprüfung": "audit",
    "kontrolle": "control_check",
    "sicherheitskontrolle": "control_check",
    "dokumentation": "documentation",
    "aufzeichnungspflicht": "documentation",
    "protokollierung": "logging",
    "logführung": "logging",
    "logmanagement": "logging",
    "wiederherstellung": "recovery",
    "notfallwiederherstellung": "recovery",
    "disaster recovery": "recovery",
    "notfallplan": "contingency_plan",
    "notfallplanung": "contingency_plan",
    "wiederanlaufplan": "contingency_plan",
    "klassifizierung": "classification",
    "kategorisierung": "classification",
    "einstufung": "classification",
    "segmentierung": "segmentation",
    "netzwerksegmentierung": "segmentation",
    "netzwerk-segmentierung": "segmentation",
    "trennung": "segmentation",
    "isolierung": "isolation",
    "patch": "patch_mgmt",
    "patchmanagement": "patch_mgmt",
    "patch-management": "patch_mgmt",
    "aktualisierung": "patch_mgmt",
    "softwareaktualisierung": "patch_mgmt",
    "härtung": "hardening",
    "systemhärtung": "hardening",
    "härtungsmaßnahme": "hardening",
    "löschung": "deletion",
    "datenlöschung": "deletion",
    "löschkonzept": "deletion",
    "anonymisierung": "anonymization",
    "pseudonymisierung": "pseudonymization",
    "zugangssteuerung": "access_control",
    "zugangskontrolle": "access_control",
    "zugriffssteuerung": "access_control",
    "zugriffskontrolle": "access_control",
    "schlüsselmanagement": "key_mgmt",
    "schlüsselverwaltung": "key_mgmt",
    "key management": "key_mgmt",
    "zertifikatsverwaltung": "cert_mgmt",
    "zertifikatsmanagement": "cert_mgmt",
    "lieferant": "vendor",
    "dienstleister": "vendor",
    "auftragsverarbeiter": "vendor",
    "drittanbieter": "vendor",
    # Session management synonyms (2026-03-28)
    "sitzung": "session",
    "sitzungsverwaltung": "session_mgmt",
    "session management": "session_mgmt",
    "session-id": "session_token",
    "sitzungstoken": "session_token",
    "session-token": "session_token",
    "idle timeout": "session_timeout",
    "inaktivitäts-timeout": "session_timeout",
    "inaktivitätszeitraum": "session_timeout",
    "abmeldung": "logout",
    "cookie-attribut": "cookie_security",
    "secure-flag": "cookie_security",
    "httponly": "cookie_security",
    "samesite": "cookie_security",
    "json web token": "jwt",
    "bearer token": "jwt",
    "föderierte assertion": "federated_assertion",
    "saml assertion": "federated_assertion",
}


def _truncate_title(title: str, max_len: int = 80) -> str:
    """Truncate *title* at a word boundary to avoid mid-word cuts.

    Returns the title unchanged when it fits within *max_len*; otherwise
    cuts at the last space inside the budget, falling back to a hard cut
    when the last space lies in the first half (to avoid losing too much).
    """
    if len(title) <= max_len:
        return title
    truncated = title[:max_len]
    # Cut at last space to avoid mid-word truncation
    last_space = truncated.rfind(" ")
    if last_space > max_len // 2:
        return truncated[:last_space]
    return truncated


def _normalize_object(object_raw: str) -> str:
    """Normalize object text to a snake_case key for merge hints.

    Applies synonym mapping to collapse German terms to canonical forms
    (e.g., 'Richtlinie' -> 'policy', 'Verzeichnis' -> 'register').
    Then strips qualifying prepositional phrases that would create
    near-duplicate keys (e.g., 'bei Schwellenwertüberschreitung').
    Truncates to 40 chars to collapse overly specific variants.
    Returns "unknown" for empty input or when nothing survives cleanup.
    """
    if not object_raw:
        return "unknown"
    obj_lower = object_raw.strip().lower()
    # Strip qualifying prepositional phrases that don't change core identity.
    # These create near-duplicate keys like "eskalationsprozess" vs
    # "eskalationsprozess bei schwellenwertüberschreitung".
    # (_QUALIFYING_PHRASE_RE is defined below; fine at runtime since this
    # function is only called after the module is fully loaded.)
    obj_lower = _QUALIFYING_PHRASE_RE.sub("", obj_lower).strip()
    # Synonym mapping — find the longest matching synonym (longest wins so
    # "sicherheitsvorfall" beats the embedded "vorfall").
    best_match = ""
    best_canonical = ""
    for synonym, canonical in _OBJECT_SYNONYMS.items():
        if synonym in obj_lower and len(synonym) > len(best_match):
            best_match = synonym
            best_canonical = canonical
    if best_canonical:
        obj_lower = obj_lower.replace(best_match, best_canonical, 1)
    obj = re.sub(r"\s+", "_", obj_lower.strip())
    # Transliterate German umlauts so the key stays pure ASCII snake_case.
    for src, dst in [("ä", "ae"), ("ö", "oe"), ("ü", "ue"), ("ß", "ss")]:
        obj = obj.replace(src, dst)
    obj = re.sub(r"[^a-z0-9_]", "", obj)
    # Strip trailing noise tokens (articles/prepositions stuck at the end)
    obj = re.sub(r"(_(?:der|die|das|des|dem|den|fuer|bei|von|zur|zum|mit|auf|in|und|oder|aus|an|ueber|nach|gegen|unter|vor|zwischen|als|durch|ohne|wie))+$", "", obj)
    # Truncate at 40 chars (at underscore boundary) to collapse
    # overly specific suffixes that create near-duplicate keys.
    obj = _truncate_at_boundary(obj, 40)
    return obj or "unknown"


# Regex to strip German qualifying prepositional phrases from object text.
# Matches patterns like "bei schwellenwertüberschreitung",
# "für kritische systeme", "gemäß artikel 32" etc.
_QUALIFYING_PHRASE_RE = re.compile( r"\s+(?:" r"bei\s+\w+" r"|für\s+(?:die\s+|den\s+|das\s+|kritische\s+)?\w+" r"|gemäß\s+\w+" r"|nach\s+\w+" r"|von\s+\w+" r"|im\s+(?:falle?\s+|rahmen\s+)?\w+" r"|mit\s+(?:den\s+|der\s+|dem\s+)?\w+" r"|auf\s+(?:basis|grundlage)\s+\w+" r"|zur\s+(?:einhaltung|sicherstellung|gewährleistung|vermeidung|erfüllung)\s*\w*" r"|durch\s+(?:den\s+|die\s+|das\s+)?\w+" r"|über\s+(?:den\s+|die\s+|das\s+)?\w+" r"|unter\s+\w+" r"|zwischen\s+\w+" r"|innerhalb\s+\w+" r"|gegenüber\s+\w+" r"|hinsichtlich\s+\w+" r"|bezüglich\s+\w+" r"|einschließlich\s+\w+" r").*$", re.IGNORECASE, ) def _truncate_at_boundary(text: str, max_len: int) -> str: """Truncate text at the last underscore boundary within max_len.""" if len(text) <= max_len: return text truncated = text[:max_len] last_sep = truncated.rfind("_") if last_sep > max_len // 2: return truncated[:last_sep] return truncated # ── 7b. Framework / Composite Detection ────────────────────────────────── _FRAMEWORK_KEYWORDS: list[str] = [ "praktiken", "kontrollen gemäß", "maßnahmen gemäß", "anforderungen aus", "anforderungen gemäß", "gemäß .+ umzusetzen", "framework", "standard", "controls for", "practices for", "requirements from", ] _COMPOSITE_OBJECT_KEYWORDS: list[str] = [ "ccm", "nist", "iso 27001", "iso 27002", "owasp", "bsi", "cis controls", "cobit", "sox", "pci dss", "hitrust", "soc 2", "soc2", "enisa", "kritis", ] # Container objects that are too broad for atomic controls. # These produce titles like "Sichere Sitzungsverwaltung umgesetzt" which # are not auditable — they encompass multiple sub-requirements. 
_CONTAINER_OBJECT_KEYWORDS: list[str] = [
    "sitzungsverwaltung",
    "session management",
    "session-management",
    "token-schutz",
    "tokenschutz",
    "authentifizierungsmechanismen",
    "authentifizierungsmechanismus",
    "sicherheitsmaßnahmen",
    "sicherheitsmassnahmen",
    "schutzmaßnahmen",
    "schutzmassnahmen",
    "zugriffskontrollmechanismen",
    "sicherheitsarchitektur",
    "sicherheitskontrollen",
    "datenschutzmaßnahmen",
    "datenschutzmassnahmen",
    "compliance-anforderungen",
    "risikomanagementprozess",
]

# Case-insensitive substring alternations over the keyword lists above
# (framework/standard keywords live just above this block).
_COMPOSITE_RE = re.compile(
    "|".join([*_FRAMEWORK_KEYWORDS, *_COMPOSITE_OBJECT_KEYWORDS]),
    re.IGNORECASE,
)

_CONTAINER_RE = re.compile("|".join(_CONTAINER_OBJECT_KEYWORDS), re.IGNORECASE)


def _is_composite_obligation(obligation_text: str, object_: str) -> bool:
    """Detect framework-level / composite obligations that are NOT atomic.

    Returns True if the obligation references a framework domain, standard,
    or set of practices rather than a single auditable requirement.
    """
    return _COMPOSITE_RE.search(f"{obligation_text} {object_}") is not None


def _is_container_object(object_: str) -> bool:
    """Detect overly broad container objects that should not be atomic.

    Objects like 'Sitzungsverwaltung' or 'Token-Schutz' encompass multiple
    sub-requirements and produce non-auditable controls.
    """
    return bool(object_) and _CONTAINER_RE.search(object_) is not None


# ── 7c. Output Validator (Negativregeln) ─────────────────────────────────
def _validate_atomic_control(
    atomic: "AtomicControlCandidate",
    action_type: str,
    object_class: str,
) -> list[str]:
    """Validate an atomic control against Pflichtfelder + Negativregeln.

    Returns a list of issue strings (ERROR: / WARN:). Logs warnings but
    never rejects the control. *action_type* is currently unused but kept
    for signature stability with the composer call site.
    """
    issues: list[str] = []

    # ── Pflichtfelder: every atomic control must carry these fields ──
    required_fields = [
        (atomic.title.strip(), "ERROR: title is empty"),
        (atomic.objective.strip(), "ERROR: objective is empty"),
        (atomic.test_procedure, "ERROR: test_procedure is empty"),
        (atomic.evidence, "ERROR: evidence is empty"),
    ]
    issues.extend(msg for value, msg in required_fields if not value)

    # ── Negativregeln ────────────────────────────────────
    if len(atomic.title) > 80:
        issues.append(f"ERROR: title exceeds 80 chars ({len(atomic.title)})")

    # Garbage pattern "Prüfung der {raw_infinitive}": an action verb leaked
    # into a test step. At most one leaked infinitive is reported per step.
    for idx, step in enumerate(atomic.test_procedure):
        leaked = next(
            (
                inf
                for inf in _RAW_INFINITIVES
                if re.search(
                    rf"\b(?:der|des|die)\s+{re.escape(inf)}\b",
                    step,
                    re.IGNORECASE,
                )
            ),
            None,
        )
        if leaked is not None:
            issues.append(
                f"ERROR: test_procedure[{idx}] contains raw infinitive '{leaked}'"
            )

    issues.extend(
        f"ERROR: evidence[{idx}] is empty string"
        for idx, item in enumerate(atomic.evidence)
        if not item.strip()
    )

    # ── Warnregeln (advisory only) ───────────────────────
    confidence = getattr(atomic, "_decomposition_confidence", None)
    if confidence is not None and confidence < 0.5:
        issues.append(f"WARN: low confidence ({confidence})")
    if object_class == "general":
        issues.append("WARN: object_class is 'general' (unclassified)")
    if getattr(atomic, "_is_composite", False):
        issues.append("WARN: composite/framework obligation — requires further decomposition")

    # ERROR findings are logged louder than WARN findings.
    for issue in issues:
        emit = logger.warning if issue.startswith("ERROR:") else logger.debug
        emit("Validation: %s — title=%s", issue, atomic.title[:60])

    return issues


# ── 8.
Confidence Scoring ───────────────────────────────────────────────
def _score_pass0b_confidence(
    action_type: str,
    object_class: str,
    trigger_q: str,
    has_specific_template: bool,
) -> float:
    """Score decomposition confidence for a Pass 0b candidate.

    Additive score: 0.3 base, + 0.25 for a classified action, + 0.20 for a
    classified object, + 0.10 when a trigger qualifier was found, + 0.15
    when a specific (action, object) template matched; capped at 1.0 and
    rounded to 2 decimals.
    """
    score = 0.3  # base
    if action_type != "default":
        score += 0.25
    if object_class != "general":
        score += 0.20
    if trigger_q:
        score += 0.10
    if has_specific_template:
        score += 0.15
    return round(min(score, 1.0), 2)


# ── 9. Compose Function ─────────────────────────────────────────────────
def _compose_deterministic(
    obligation_text: str,
    action: str,
    object_: str,
    parent_title: str,
    parent_severity: str,
    parent_category: str,
    is_test: bool,
    is_reporting: bool,
    trigger_type: Optional[str] = None,
    condition: Optional[str] = None,
) -> "AtomicControlCandidate":
    """Compose an atomic control deterministically from obligation data.

    No LLM required. Uses action-type classification, object-class
    matching, and trigger-aware templates.

    Generates:
    - Title as '{Object} {state suffix}'
    - Statement as '{condition_prefix} {object} ist {trigger} {action}'
    - Evidence/test bundles from (action_type, object_class) matrix
    - Pattern candidates for downstream categorization
    - Merge hint for downstream dedup
    - Structured timing (deadline_hours, frequency)
    - Confidence score
    - Validation issues (Negativregeln)
    """
    # Override action type for flagged obligations: the Pass 0a flags
    # (guardrails 3 and 4) take precedence over verb classification.
    if is_test:
        action_type = "test"
    elif is_reporting:
        action_type = "report"
    else:
        action_type = _classify_action(action)
    object_class = _classify_object(object_)

    # Template lookup: specific combo → action base → default
    has_specific = (action_type, object_class) in _SPECIFIC_TEMPLATES
    template = (
        _SPECIFIC_TEMPLATES.get((action_type, object_class))
        or _ACTION_TEMPLATES.get(action_type)
        or _DEFAULT_ACTION_TEMPLATE
    )

    # Object for template substitution (fallback to parent title)
    obj_display = object_.strip() if object_ else parent_title

    # ── Title: "{Object} {Zustand}" ───────────────────────────
    state = _ACTION_STATE_SUFFIX.get(action_type, "umgesetzt")
    if object_:
        title = _truncate_title(f"{object_.strip()} {state}")
    elif action:
        title = _truncate_title(f"{action.strip().capitalize()} {state}")
    else:
        title = _truncate_title(f"{parent_title} {state}")

    # ── Objective = obligation text (the normative statement) ─
    objective = obligation_text.strip()[:2000]

    # ── Requirements = obligation as concrete requirement ─────
    requirements = [obligation_text.strip()] if obligation_text else []

    # ── Test procedure from templates with object substitution
    test_procedure = [
        tp.replace("{object}", obj_display) for tp in template["test_procedure"]
    ]

    # ── Trigger qualifier → add timing test step ──────────────
    trigger_q = _extract_trigger_qualifier(trigger_type, obligation_text)
    if trigger_q and test_procedure:
        test_procedure.append(
            f"Prüfung der Frist-/Trigger-Einhaltung: {trigger_q}"
        )

    # ── Evidence from templates ───────────────────────────────
    evidence = list(template["evidence"])

    # ── Merge hint for downstream dedup ───────────────────────
    norm_obj = _normalize_object(object_)
    trigger_key = trigger_type or "none"
    merge_hint = f"{action_type}:{norm_obj}:{trigger_key}"

    # ── Statement: structured normative sentence ──────────────
    condition_prefix = ""
    if condition and condition.strip():
        condition_prefix = condition.strip().rstrip(",") + ","
    trigger_clause = trigger_q if trigger_q else ""
    obj_for_stmt = object_.strip() if object_ else parent_title
    if obj_for_stmt:
        # Drop empty fragments so the sentence has no double spaces.
        parts = [p for p in [condition_prefix, obj_for_stmt, "ist", trigger_clause, state] if p]
        statement = " ".join(parts)
    else:
        statement = ""

    # ── Pattern candidates ────────────────────────────────────
    pattern_candidates = _PATTERN_CANDIDATES_MAP.get(
        (action_type, object_class),
        _PATTERN_CANDIDATES_BY_ACTION.get(action_type, []),
    )

    # ── Structured timing ─────────────────────────────────────
    deadline_hours, frequency = _extract_structured_timing(obligation_text)

    # ── Confidence score ──────────────────────────────────────
    confidence = _score_pass0b_confidence(
        action_type, object_class, trigger_q, has_specific,
    )

    atomic = AtomicControlCandidate(
        title=title,
        objective=objective,
        requirements=requirements,
        test_procedure=test_procedure,
        evidence=evidence,
        severity=_calibrate_severity(parent_severity, action_type),
        category=parent_category or "governance",
    )

    # Attach extra metadata (stored in generation_metadata)
    atomic.domain = f"{action_type}:{object_class}"
    atomic.source_regulation = merge_hint
    # Dynamic attributes below are not declared on the dataclass, hence the
    # attr-defined ignores; downstream passes read them via getattr.
    atomic._decomposition_confidence = confidence  # type: ignore[attr-defined]
    atomic._statement = statement  # type: ignore[attr-defined]
    atomic._pattern_candidates = list(pattern_candidates)  # type: ignore[attr-defined]
    atomic._deadline_hours = deadline_hours  # type: ignore[attr-defined]
    atomic._frequency = frequency  # type: ignore[attr-defined]

    # ── Composite / Framework / Container detection ────────────
    is_composite = _is_composite_obligation(obligation_text, object_)
    is_container = _is_container_object(object_)
    atomic._is_composite = is_composite or is_container  # type: ignore[attr-defined]
    # composite (framework reference) takes precedence over container
    if is_composite:
        atomic._atomicity = "composite"  # type: ignore[attr-defined]
    elif is_container:
        atomic._atomicity = "container"  # type: ignore[attr-defined]
    else:
        atomic._atomicity = "atomic"  # type: ignore[attr-defined]
    atomic._requires_decomposition = is_composite or is_container  # type: ignore[attr-defined]

    # ── Validate (log issues, never reject) ───────────────────
    validation_issues = _validate_atomic_control(atomic, action_type, object_class)
    atomic._validation_issues = validation_issues  # type: ignore[attr-defined]

    return atomic


def _build_pass0b_prompt(
    obligation_text: str,
    action: str,
    object_: str,
    parent_title: str,
    parent_category: str,
    source_ref: str,
) -> str:
    # Single-obligation Pass 0b prompt (German; the LLM answers as JSON).
    return f"""\
Erstelle aus der folgenden Pflicht ein atomares Control.
PFLICHT: {obligation_text}
HANDLUNG: {action}
GEGENSTAND: {object_}

KONTEXT (Ursprungs-Control):
Titel: {parent_title}
Kategorie: {parent_category}
Quellreferenz: {source_ref}

Antworte als JSON:
{{
  "title": "Kurzer Titel (max 80 Zeichen, deutsch)",
  "objective": "Was muss erreicht werden? (1-2 Sätze)",
  "requirements": ["Konkrete Anforderung 1", "Anforderung 2"],
  "test_procedure": ["Prüfschritt 1", "Prüfschritt 2"],
  "evidence": ["Nachweis 1", "Nachweis 2"],
  "severity": "critical|high|medium|low",
  "category": "security|privacy|governance|operations|finance|reporting"
}}"""


# ---------------------------------------------------------------------------
# Batch Prompts (multiple controls/obligations per API call)
# ---------------------------------------------------------------------------

def _build_pass0a_batch_prompt(controls: list[dict]) -> str:
    """Build a prompt for extracting obligations from multiple controls.

    Each control dict needs: control_id, title, objective, requirements,
    test_procedure, source_ref.

    The model is asked to answer with a JSON object keyed by control_id so
    the caller can route each obligation array back to its source control.
    """
    parts = []
    for i, ctrl in enumerate(controls, 1):
        parts.append(
            f"--- CONTROL {i} (ID: {ctrl['control_id']}) ---\n"
            f"Titel: {ctrl['title']}\n"
            f"Ziel: {ctrl['objective']}\n"
            f"Anforderungen: {ctrl['requirements']}\n"
            f"Prüfverfahren: {ctrl['test_procedure']}\n"
            f"Quellreferenz: {ctrl['source_ref']}"
        )
    controls_text = "\n\n".join(parts)
    # Shape example built from the first two IDs so the model keys its
    # answer object by control ID.
    ids_example = ", ".join(f'"{c["control_id"]}": [...]' for c in controls[:2])
    return f"""\
Analysiere die folgenden {len(controls)} Controls und extrahiere aus JEDEM \
alle einzelnen normativen Pflichten.

{controls_text}

Antworte als JSON-Objekt. Fuer JEDES Control ein Key (die Control-ID) mit \
einem Array von Pflichten:
{{ {ids_example} }}

Jede Pflicht hat dieses Format:
{{
  "obligation_text": "Kurze, präzise Formulierung der Pflicht",
  "action": "Hauptverb/Handlung",
  "object": "Gegenstand der Pflicht",
  "condition": null,
  "normative_strength": "must",
  "is_test_obligation": false,
  "is_reporting_obligation": false
}}"""


def _build_pass0b_batch_prompt(obligations: list[dict]) -> str:
    """Build a prompt for composing multiple atomic controls.

    Each obligation dict needs: candidate_id, obligation_text, action,
    object, parent_title, parent_category, source_ref.

    The model is asked to answer with a JSON object keyed by candidate_id.
    """
    parts = []
    for i, obl in enumerate(obligations, 1):
        parts.append(
            f"--- PFLICHT {i} (ID: {obl['candidate_id']}) ---\n"
            f"PFLICHT: {obl['obligation_text']}\n"
            f"HANDLUNG: {obl['action']}\n"
            f"GEGENSTAND: {obl['object']}\n"
            f"KONTEXT: {obl['parent_title']} | {obl['parent_category']}\n"
            f"Quellreferenz: {obl['source_ref']}"
        )
    obligations_text = "\n\n".join(parts)
    # Shape example from the first two candidate IDs.
    ids_example = ", ".join(f'"{o["candidate_id"]}": {{...}}' for o in obligations[:2])
    return f"""\
Erstelle aus den folgenden {len(obligations)} Pflichten je ein atomares Control.

{obligations_text}

Antworte als JSON-Objekt. Fuer JEDE Pflicht ein Key (die Pflicht-ID):
{{ {ids_example} }}

Jedes Control hat dieses Format:
{{
  "title": "Kurzer Titel (max 80 Zeichen, deutsch)",
  "objective": "Was muss erreicht werden? (1-2 Sätze)",
  "requirements": ["Konkrete Anforderung 1", "Anforderung 2"],
  "test_procedure": ["Prüfschritt 1", "Prüfschritt 2"],
  "evidence": ["Nachweis 1", "Nachweis 2"],
  "severity": "critical|high|medium|low",
  "category": "security|privacy|governance|operations|finance|reporting"
}}"""


# ---------------------------------------------------------------------------
# Anthropic API (with prompt caching)
# ---------------------------------------------------------------------------

async def _llm_anthropic(
    prompt: str,
    system_prompt: str,
    max_tokens: int = 8192,
) -> str:
    """Call Anthropic Messages API with prompt caching for system prompt.

    Args:
        prompt: User-turn content.
        system_prompt: System prompt; marked with ``cache_control`` so
            repeated calls within the cache TTL reuse the cached prefix.
        max_tokens: Response token budget.

    Returns:
        The text of the first content block, or "" on any failure
        (missing key raises; HTTP/transport errors are logged and
        swallowed — callers treat "" as "LLM failed").
    """
    if not ANTHROPIC_API_KEY:
        raise RuntimeError("ANTHROPIC_API_KEY not set")
    headers = {
        "x-api-key": ANTHROPIC_API_KEY,
        "anthropic-version": "2023-06-01",
        "content-type": "application/json",
    }
    payload = {
        "model": ANTHROPIC_MODEL,
        "max_tokens": max_tokens,
        "system": [
            {
                "type": "text",
                "text": system_prompt,
                # Ephemeral prompt cache: system prompt is identical across
                # calls, so caching it cuts input-token cost.
                "cache_control": {"type": "ephemeral"},
            }
        ],
        "messages": [{"role": "user", "content": prompt}],
    }
    try:
        async with httpx.AsyncClient(timeout=LLM_TIMEOUT) as client:
            resp = await client.post(
                f"{ANTHROPIC_API_URL}/messages",
                headers=headers,
                json=payload,
            )
            if resp.status_code != 200:
                logger.error(
                    "Anthropic API %d: %s", resp.status_code, resp.text[:300]
                )
                return ""
            data = resp.json()
            # Log cache performance
            usage = data.get("usage", {})
            cached = usage.get("cache_read_input_tokens", 0)
            if cached > 0:
                logger.debug(
                    "Prompt cache hit: %d cached tokens", cached
                )
            content = data.get("content", [])
            if content and isinstance(content, list):
                return content[0].get("text", "")
            return ""
    except Exception as e:
        logger.error("Anthropic request failed: %s", e)
        return ""


# ---------------------------------------------------------------------------
# Anthropic Batch API (50% cost reduction, async processing)
# ---------------------------------------------------------------------------

async def create_anthropic_batch(
    requests: list[dict],
) -> dict:
    """Submit a batch of requests to Anthropic Batch API.

    Each request: {"custom_id": "...", "params": {model, max_tokens, system, messages}}
    Returns batch metadata including batch_id.

    Raises:
        RuntimeError: if the API key is unset or the API responds with a
            non-2xx status.
    """
    if not ANTHROPIC_API_KEY:
        raise RuntimeError("ANTHROPIC_API_KEY not set")
    headers = {
        "x-api-key": ANTHROPIC_API_KEY,
        "anthropic-version": "2023-06-01",
        "content-type": "application/json",
    }
    async with httpx.AsyncClient(timeout=60) as client:
        resp = await client.post(
            f"{ANTHROPIC_API_URL}/messages/batches",
            headers=headers,
            json={"requests": requests},
        )
        if resp.status_code not in (200, 201):
            raise RuntimeError(
                f"Batch API failed {resp.status_code}: {resp.text[:500]}"
            )
        return resp.json()


async def check_batch_status(batch_id: str) -> dict:
    """Check the processing status of a batch.

    Raises ``httpx.HTTPStatusError`` on non-2xx responses.
    """
    headers = {
        "x-api-key": ANTHROPIC_API_KEY,
        "anthropic-version": "2023-06-01",
    }
    async with httpx.AsyncClient(timeout=30) as client:
        resp = await client.get(
            f"{ANTHROPIC_API_URL}/messages/batches/{batch_id}",
            headers=headers,
        )
        resp.raise_for_status()
        return resp.json()


async def fetch_batch_results(batch_id: str) -> list[dict]:
    """Fetch results of a completed batch. Returns list of result objects.

    Raises ``httpx.HTTPStatusError`` on non-2xx responses.
    """
    headers = {
        "x-api-key": ANTHROPIC_API_KEY,
        "anthropic-version": "2023-06-01",
    }
    async with httpx.AsyncClient(timeout=120) as client:
        resp = await client.get(
            f"{ANTHROPIC_API_URL}/messages/batches/{batch_id}/results",
            headers=headers,
        )
        resp.raise_for_status()
        # Response is JSONL (one JSON object per line)
        results = []
        for line in resp.text.strip().split("\n"):
            if line.strip():
                results.append(json.loads(line))
        return results


# ---------------------------------------------------------------------------
# Parse helpers
# ---------------------------------------------------------------------------

def _parse_json_array(text: str) -> list[dict]:
    """Extract a JSON array from LLM response text.

    A top-level dict is accepted and wrapped in a one-element list.
    Returns [] when nothing parseable is found (never raises).
    NOTE: the parameter shadows the module-level sqlalchemy ``text`` import
    within this function.
    """
    # Try direct parse
    try:
        result = json.loads(text)
        if isinstance(result, list):
            return result
        if isinstance(result, dict):
            return [result]
    except json.JSONDecodeError:
        pass
    # Try extracting JSON array block (greedy: first '[' to last ']')
    match = re.search(r"\[[\s\S]*\]", text)
    if match:
        try:
            result = json.loads(match.group())
            if isinstance(result, list):
                return result
        except json.JSONDecodeError:
            pass
    return []


def _parse_json_object(text: str) -> dict:
    """Extract a JSON object from LLM response text.

    Returns {} when nothing parseable is found (never raises).
    NOTE: the parameter shadows the module-level sqlalchemy ``text`` import
    within this function.
    """
    try:
        result = json.loads(text)
        if isinstance(result, dict):
            return result
    except json.JSONDecodeError:
        pass
    # Greedy brace match: first '{' to last '}'
    match = re.search(r"\{[\s\S]*\}", text)
    if match:
        try:
            result = json.loads(match.group())
            if isinstance(result, dict):
                return result
        except json.JSONDecodeError:
            pass
    return {}


def _ensure_list(val) -> list:
    """Ensure value is a list (string becomes a one-element list; empty
    string and other scalars become [])."""
    if isinstance(val, list):
        return val
    if isinstance(val, str):
        return [val] if val else []
    return []


# ---------------------------------------------------------------------------
# Decomposition Pass
# ---------------------------------------------------------------------------

class DecompositionPass:
    """Pass 0: Decompose Rich Controls into atomic candidates.
Usage:: decomp = DecompositionPass(db=session) stats_0a = await decomp.run_pass0a(limit=100) stats_0b = await decomp.run_pass0b(limit=100) """ def __init__(self, db: Session, dedup_enabled: bool = False): self.db = db self._dedup = None if dedup_enabled: from compliance.services.control_dedup import ( ControlDedupChecker, DEDUP_ENABLED, ) if DEDUP_ENABLED: self._dedup = ControlDedupChecker(db) # ------------------------------------------------------------------- # Pass 0a: Obligation Extraction # ------------------------------------------------------------------- async def run_pass0a( self, limit: int = 0, batch_size: int = 0, use_anthropic: bool = False, category_filter: Optional[str] = None, source_filter: Optional[str] = None, ) -> dict: """Extract obligation candidates from rich controls. Args: limit: Max controls to process (0 = no limit). batch_size: Controls per LLM call (0 = use DECOMPOSITION_BATCH_SIZE env var, or 1 for single mode). Only >1 with Anthropic. use_anthropic: Use Anthropic API (True) or Ollama (False). category_filter: Only process controls matching this category (comma-separated, e.g. "security,privacy"). source_filter: Only process controls from these source regulations (comma-separated, e.g. "Maschinenverordnung,Cyber Resilience Act"). Matches against source_citation->>'source' using ILIKE. 
""" if batch_size <= 0: batch_size = DECOMPOSITION_BATCH_SIZE if use_anthropic else 1 # Find rich controls not yet decomposed query = """ SELECT cc.id, cc.control_id, cc.title, cc.objective, cc.requirements, cc.test_procedure, cc.source_citation, cc.category FROM canonical_controls cc WHERE cc.release_state NOT IN ('deprecated') AND cc.parent_control_uuid IS NULL AND NOT EXISTS ( SELECT 1 FROM obligation_candidates oc WHERE oc.parent_control_uuid = cc.id ) """ params = {} if category_filter: cats = [c.strip() for c in category_filter.split(",") if c.strip()] if cats: query += " AND cc.category IN :cats" params["cats"] = tuple(cats) if source_filter: sources = [s.strip() for s in source_filter.split(",") if s.strip()] if sources: clauses = [] for idx, src in enumerate(sources): key = f"src_{idx}" clauses.append(f"cc.source_citation::text ILIKE :{key}") params[key] = f"%{src}%" query += " AND (" + " OR ".join(clauses) + ")" query += " ORDER BY cc.created_at" if limit > 0: query += f" LIMIT {limit}" rows = self.db.execute(text(query), params).fetchall() stats = { "controls_processed": 0, "obligations_extracted": 0, "obligations_validated": 0, "obligations_rejected": 0, "controls_skipped_empty": 0, "llm_calls": 0, "errors": 0, "provider": "anthropic" if use_anthropic else "deterministic", "batch_size": batch_size, } # Prepare control data prepared = [] for row in rows: title = row[2] or "" objective = row[3] or "" req_str = _format_field(row[4] or "") test_str = _format_field(row[5] or "") source_str = _format_citation(row[6] or "") if not title and not objective and not req_str: stats["controls_skipped_empty"] += 1 continue prepared.append({ "uuid": str(row[0]), "control_id": row[1] or "", "title": title, "objective": objective, "requirements": req_str, "test_procedure": test_str, "source_ref": source_str, "category": row[7] or "", }) # Process in batches for i in range(0, len(prepared), batch_size): batch = prepared[i : i + batch_size] try: if use_anthropic and 
len(batch) > 1: # Batched Anthropic call prompt = _build_pass0a_batch_prompt(batch) llm_response = await _llm_anthropic( prompt=prompt, system_prompt=_PASS0A_SYSTEM_PROMPT, max_tokens=max(8192, len(batch) * 2000), ) stats["llm_calls"] += 1 results_by_id = _parse_json_object(llm_response) for ctrl in batch: raw_obls = results_by_id.get(ctrl["control_id"], []) if not isinstance(raw_obls, list): raw_obls = [raw_obls] if raw_obls else [] if not raw_obls: raw_obls = [_fallback_obligation(ctrl)] self._process_pass0a_obligations( raw_obls, ctrl["control_id"], ctrl["uuid"], stats ) stats["controls_processed"] += 1 elif use_anthropic: # Single Anthropic call ctrl = batch[0] prompt = _build_pass0a_prompt( title=ctrl["title"], objective=ctrl["objective"], requirements=ctrl["requirements"], test_procedure=ctrl["test_procedure"], source_ref=ctrl["source_ref"], ) llm_response = await _llm_anthropic( prompt=prompt, system_prompt=_PASS0A_SYSTEM_PROMPT, ) stats["llm_calls"] += 1 raw_obls = _parse_json_array(llm_response) if not raw_obls: raw_obls = [_fallback_obligation(ctrl)] self._process_pass0a_obligations( raw_obls, ctrl["control_id"], ctrl["uuid"], stats ) stats["controls_processed"] += 1 else: # Ollama (single only) from compliance.services.obligation_extractor import _llm_ollama ctrl = batch[0] prompt = _build_pass0a_prompt( title=ctrl["title"], objective=ctrl["objective"], requirements=ctrl["requirements"], test_procedure=ctrl["test_procedure"], source_ref=ctrl["source_ref"], ) llm_response = await _llm_ollama( prompt=prompt, system_prompt=_PASS0A_SYSTEM_PROMPT, ) stats["llm_calls"] += 1 raw_obls = _parse_json_array(llm_response) if not raw_obls: raw_obls = [_fallback_obligation(ctrl)] self._process_pass0a_obligations( raw_obls, ctrl["control_id"], ctrl["uuid"], stats ) stats["controls_processed"] += 1 # Commit after each successful sub-batch to avoid losing work self.db.commit() except Exception as e: ids = ", ".join(c["control_id"] for c in batch) logger.error("Pass 0a 
failed for [%s]: %s", ids, e) stats["errors"] += 1 try: self.db.rollback() except Exception: pass logger.info("Pass 0a: %s", stats) return stats _NORMATIVE_STRENGTH_MAP = { "muss": "must", "must": "must", "soll": "should", "should": "should", "kann": "may", "may": "may", } def _process_pass0a_obligations( self, raw_obligations: list[dict], control_id: str, control_uuid: str, stats: dict, ) -> None: """Validate and write obligation candidates from LLM output.""" for idx, raw in enumerate(raw_obligations): raw_strength = raw.get("normative_strength", "must").lower().strip() normative_strength = self._NORMATIVE_STRENGTH_MAP.get( raw_strength, "must" ) cand = ObligationCandidate( candidate_id=f"OC-{control_id}-{idx + 1:02d}", parent_control_uuid=control_uuid, obligation_text=raw.get("obligation_text", ""), action=raw.get("action", ""), object_=raw.get("object", ""), condition=raw.get("condition"), normative_strength=normative_strength, is_test_obligation=bool(raw.get("is_test_obligation", False)), is_reporting_obligation=bool(raw.get("is_reporting_obligation", False)), ) # Auto-detect test/reporting if LLM missed it if not cand.is_test_obligation and _TEST_RE.search(cand.obligation_text): cand.is_test_obligation = True if not cand.is_reporting_obligation and _REPORTING_RE.search(cand.obligation_text): cand.is_reporting_obligation = True # Quality gate + obligation type classification flags = quality_gate(cand) cand.quality_flags = flags cand.extraction_confidence = _compute_extraction_confidence(flags) cand.obligation_type = flags.get("obligation_type", "empfehlung") if passes_quality_gate(flags): cand.release_state = "validated" stats["obligations_validated"] += 1 else: cand.release_state = "rejected" stats["obligations_rejected"] += 1 self._write_obligation_candidate(cand) stats["obligations_extracted"] += 1 # ------------------------------------------------------------------- # Pass 0b: Atomic Control Composition # 
    async def run_pass0b(
        self,
        limit: int = 0,
        batch_size: int = 0,
        use_anthropic: bool = False,
    ) -> dict:
        """Compose atomic controls from validated obligation candidates.

        Args:
            limit: Max candidates to process (0 = no limit).
            batch_size: Commit interval (0 = auto). For LLM: API batch size.
            use_anthropic: Use Anthropic API (True) or deterministic engine (False).

        Returns:
            Stats dict (candidates processed, controls created, llm/dedup
            counters, errors).
        """
        if batch_size <= 0:
            batch_size = DECOMPOSITION_BATCH_SIZE if use_anthropic else 50
        # Validated, unmerged candidates whose parent does not already have
        # a live pass0b atomic control matching this action (title LIKE on
        # the first 20 chars of the action is the matching heuristic).
        query = """
            SELECT oc.id, oc.candidate_id, oc.parent_control_uuid,
                   oc.obligation_text, oc.action, oc.object, oc.condition,
                   oc.is_test_obligation, oc.is_reporting_obligation,
                   cc.title AS parent_title,
                   cc.category AS parent_category,
                   cc.source_citation AS parent_citation,
                   cc.severity AS parent_severity,
                   cc.control_id AS parent_control_id,
                   oc.trigger_type, oc.is_implementation_specific
            FROM obligation_candidates oc
            JOIN canonical_controls cc ON cc.id = oc.parent_control_uuid
            WHERE oc.release_state = 'validated'
              AND oc.merged_into_id IS NULL
              AND NOT EXISTS (
                  SELECT 1 FROM canonical_controls ac
                  WHERE ac.parent_control_uuid = oc.parent_control_uuid
                    AND ac.decomposition_method = 'pass0b'
                    AND ac.release_state NOT IN ('deprecated', 'duplicate')
                    AND ac.title LIKE '%' || LEFT(oc.action, 20) || '%'
              )
        """
        if limit > 0:
            query += f" LIMIT {limit}"
        rows = self.db.execute(text(query)).fetchall()

        stats = {
            "candidates_processed": 0,
            "controls_created": 0,
            "llm_failures": 0,
            "llm_calls": 0,
            "errors": 0,
            "provider": "anthropic" if use_anthropic else "deterministic",
            "batch_size": batch_size,
            "dedup_enabled": self._dedup is not None,
            "dedup_linked": 0,
            "dedup_review": 0,
            "skipped_merged": 0,
        }

        # Prepare obligation data (positional row → named dict)
        prepared = []
        for row in rows:
            prepared.append({
                "oc_id": str(row[0]),
                "candidate_id": row[1] or "",
                "parent_uuid": str(row[2]),
                "obligation_text": row[3] or "",
                "action": row[4] or "",
                "object": row[5] or "",
                "condition": row[6] or "",
                "is_test": row[7],
                "is_reporting": row[8],
                "parent_title": row[9] or "",
                "parent_category": row[10] or "",
                "parent_citation": row[11] or "",
                "parent_severity": row[12] or "medium",
                "parent_control_id": row[13] or "",
                "source_ref": _format_citation(row[11] or ""),
                "trigger_type": row[14] or "continuous",
                "is_implementation_specific": row[15] or False,
            })

        # Process in batches
        for i in range(0, len(prepared), batch_size):
            batch = prepared[i : i + batch_size]
            try:
                if use_anthropic and len(batch) > 1:
                    # Batched Anthropic call — answers keyed by candidate_id.
                    prompt = _build_pass0b_batch_prompt(batch)
                    llm_response = await _llm_anthropic(
                        prompt=prompt,
                        system_prompt=_PASS0B_SYSTEM_PROMPT,
                        max_tokens=min(16384, max(4096, len(batch) * 500)),
                    )
                    stats["llm_calls"] += 1
                    results_by_id = _parse_json_object(llm_response)
                    for obl in batch:
                        # Missing/empty entry → _process_pass0b_control falls
                        # back to the deterministic engine.
                        parsed = results_by_id.get(obl["candidate_id"], {})
                        await self._process_pass0b_control(obl, parsed, stats)
                elif use_anthropic:
                    obl = batch[0]
                    prompt = _build_pass0b_prompt(
                        obligation_text=obl["obligation_text"],
                        action=obl["action"],
                        object_=obl["object"],
                        parent_title=obl["parent_title"],
                        parent_category=obl["parent_category"],
                        source_ref=obl["source_ref"],
                    )
                    llm_response = await _llm_anthropic(
                        prompt=prompt,
                        system_prompt=_PASS0B_SYSTEM_PROMPT,
                    )
                    stats["llm_calls"] += 1
                    parsed = _parse_json_object(llm_response)
                    await self._process_pass0b_control(obl, parsed, stats)
                else:
                    # Deterministic engine — no LLM required
                    for obl in batch:
                        await self._route_and_compose(obl, stats)
                # Commit after each successful sub-batch
                self.db.commit()
            except Exception as e:
                ids = ", ".join(o["candidate_id"] for o in batch)
                logger.error("Pass 0b failed for [%s]: %s", ids, e)
                stats["errors"] += 1
                try:
                    self.db.rollback()
                except Exception:
                    pass

        logger.info("Pass 0b: %s", stats)
        return stats

    async def _route_and_compose(
        self,
        obl: dict,
        stats: dict,
    ) -> None:
        """Route an obligation through the framework detection layer, then
        compose atomic controls.

        Routing types:
        - atomic: compose directly via _compose_deterministic
        - compound: split compound verbs, compose each
        - framework_container: decompose via framework registry, then
          compose each sub-obligation
        """
        # Deferred import: framework decomposition is only needed on the
        # deterministic path.
        from compliance.services.framework_decomposition import (
            classify_routing,
            decompose_framework_container,
        )

        routing = classify_routing(
            obligation_text=obl["obligation_text"],
            action_raw=obl["action"],
            object_raw=obl["object"],
            condition_raw=obl.get("condition"),
        )

        if routing.routing_type == "framework_container" and routing.framework_ref:
            # Decompose framework container into sub-obligations
            result = decompose_framework_container(
                obligation_candidate_id=obl["candidate_id"],
                parent_control_id=obl["parent_control_id"],
                obligation_text=obl["obligation_text"],
                framework_ref=routing.framework_ref,
                framework_domain=routing.framework_domain,
            )
            stats.setdefault("framework_decomposed", 0)
            stats.setdefault("framework_sub_obligations", 0)

            if result.release_state == "decomposed" and result.decomposed_obligations:
                stats["framework_decomposed"] += 1
                stats["framework_sub_obligations"] += len(result.decomposed_obligations)
                logger.info(
                    "Framework decomposition: %s → %s/%s → %d sub-obligations",
                    obl["candidate_id"],
                    routing.framework_ref,
                    routing.framework_domain,
                    len(result.decomposed_obligations),
                )
                # Compose each sub-obligation (compound verbs in a
                # sub-obligation are split further).
                for d_obl in result.decomposed_obligations:
                    sub_obl = {
                        **obl,
                        "obligation_text": d_obl.obligation_text,
                        "action": d_obl.action_raw,
                        "object": d_obl.object_raw,
                    }
                    sub_actions = _split_compound_action(sub_obl["action"])
                    for sub_action in sub_actions:
                        atomic = _compose_deterministic(
                            obligation_text=sub_obl["obligation_text"],
                            action=sub_action,
                            object_=sub_obl["object"],
                            parent_title=obl["parent_title"],
                            parent_severity=obl["parent_severity"],
                            parent_category=obl["parent_category"],
                            is_test=obl["is_test"],
                            is_reporting=obl["is_reporting"],
                            trigger_type=obl.get("trigger_type"),
                            condition=obl.get("condition"),
                        )
                        # Enrich gen_meta with framework info
                        atomic._framework_ref = routing.framework_ref  # type: ignore[attr-defined]
                        atomic._framework_domain = routing.framework_domain  # type: ignore[attr-defined]
                        atomic._framework_subcontrol_id = d_obl.subcontrol_id  # type: ignore[attr-defined]
                        atomic._decomposition_source = "framework_decomposition"  # type: ignore[attr-defined]
                        await self._process_pass0b_control(
                            obl, {}, stats, atomic=atomic,
                        )
                return
            else:
                # Unmatched framework — fall through to normal composition
                logger.warning(
                    "Framework decomposition unmatched: %s — %s",
                    obl["candidate_id"], result.issues,
                )

        # Atomic or compound or unmatched framework: normal composition
        sub_actions = _split_compound_action(obl["action"])
        for sub_action in sub_actions:
            atomic = _compose_deterministic(
                obligation_text=obl["obligation_text"],
                action=sub_action,
                object_=obl["object"],
                parent_title=obl["parent_title"],
                parent_severity=obl["parent_severity"],
                parent_category=obl["parent_category"],
                is_test=obl["is_test"],
                is_reporting=obl["is_reporting"],
                trigger_type=obl.get("trigger_type"),
                condition=obl.get("condition"),
            )
            await self._process_pass0b_control(
                obl, {}, stats, atomic=atomic,
            )

    async def _process_pass0b_control(
        self,
        obl: dict,
        parsed: dict,
        stats: dict,
        atomic: Optional[AtomicControlCandidate] = None,
    ) -> None:
        """Create atomic control from deterministic engine, LLM output, or fallback.

        If dedup is enabled, checks for duplicates before insertion:
        - LINK: adds parent link to existing control instead of creating new
        - REVIEW: queues for human review, does not create control
        - NEW: creates new control and indexes in Qdrant
        """
        if atomic is not None:
            # Deterministic engine — atomic already composed
            pass
        elif not parsed or not parsed.get("title"):
            # LLM failed → use deterministic engine as fallback
            atomic = _compose_deterministic(
                obligation_text=obl["obligation_text"],
                action=obl["action"],
                object_=obl["object"],
                parent_title=obl["parent_title"],
                parent_severity=obl["parent_severity"],
                parent_category=obl["parent_category"],
                is_test=obl["is_test"],
                is_reporting=obl["is_reporting"],
                condition=obl.get("condition"),
            )
            stats["llm_failures"] += 1
        else:
            # LLM output — clamp field lengths, coerce list fields,
            # normalize severity (parent values as fallback).
            atomic = AtomicControlCandidate(
                title=parsed.get("title", "")[:200],
                objective=parsed.get("objective", "")[:2000],
                requirements=_ensure_list(parsed.get("requirements", [])),
                test_procedure=_ensure_list(parsed.get("test_procedure", [])),
                evidence=_ensure_list(parsed.get("evidence", [])),
                severity=_normalize_severity(
                    parsed.get("severity", obl["parent_severity"])
                ),
                category=parsed.get("category", obl["parent_category"]),
            )

        atomic.parent_control_uuid = obl["parent_uuid"]
        atomic.obligation_candidate_id = obl["candidate_id"]

        # Cap severity for implementation-specific obligations
        if obl.get("is_implementation_specific") and atomic.severity in (
            "critical", "high"
        ):
            atomic.severity = "medium"
        # Override category for test obligations
        if obl.get("is_test"):
            atomic.category = "testing"

        # ── Dedup check (if enabled) ────────────────────────────
        if self._dedup:
            pattern_id = None
            # Try to get pattern_id from parent control
            pid_row = self.db.execute(text(
                "SELECT pattern_id FROM canonical_controls WHERE id = CAST(:uid AS uuid)"
            ), {"uid": obl["parent_uuid"]}).fetchone()
            if pid_row:
                pattern_id = pid_row[0]
            result = await self._dedup.check_duplicate(
                action=obl.get("action", ""),
                obj=obl.get("object", ""),
                title=atomic.title,
                pattern_id=pattern_id,
            )
            if result.verdict == "link":
                # Duplicate of an existing control: link its parent instead
                # of inserting a new row, then stop.
                self._dedup.add_parent_link(
                    control_uuid=result.matched_control_uuid,
                    parent_control_uuid=obl["parent_uuid"],
                    link_type="dedup_merge",
                    confidence=result.similarity_score,
                )
                stats.setdefault("dedup_linked", 0)
                stats["dedup_linked"] += 1
                stats["candidates_processed"] += 1
                logger.info("Dedup LINK: %s → %s (%.3f, %s)",
                            atomic.title[:60], result.matched_control_id,
                            result.similarity_score, result.stage)
                return
            if result.verdict == "review":
                # Ambiguous match: queue for a human, create nothing.
                self._dedup.write_review(
                    candidate_control_id=atomic.candidate_id or "",
                    candidate_title=atomic.title,
                    candidate_objective=atomic.objective,
                    result=result,
                    parent_control_uuid=obl["parent_uuid"],
                    obligation_candidate_id=obl.get("oc_id"),
                )
                stats.setdefault("dedup_review", 0)
                stats["dedup_review"] += 1
                stats["candidates_processed"] += 1
                logger.info("Dedup REVIEW: %s ↔ %s (%.3f, %s)",
                            atomic.title[:60], result.matched_control_id,
                            result.similarity_score, result.stage)
                return

        # ── Create new atomic control ─────────────────────────────
        seq = self._next_atomic_seq(obl["parent_control_id"])
        atomic.candidate_id = f"{obl['parent_control_id']}-A{seq:02d}"
        new_uuid = self._write_atomic_control(atomic, obl)
        self.db.execute(
            text("""
                UPDATE obligation_candidates
                SET release_state = 'composed'
                WHERE id = CAST(:oc_id AS uuid)
            """),
            {"oc_id": obl["oc_id"]},
        )
        # Index in Qdrant for future dedup checks
        if self._dedup and new_uuid:
            pattern_id_val = None
            pid_row2 = self.db.execute(text(
                "SELECT pattern_id FROM canonical_controls WHERE id = CAST(:uid AS uuid)"
            ), {"uid": obl["parent_uuid"]}).fetchone()
            if pid_row2:
                pattern_id_val = pid_row2[0]
            if pattern_id_val:
                await self._dedup.index_control(
                    control_uuid=new_uuid,
                    control_id=atomic.candidate_id,
                    title=atomic.title,
                    action=obl.get("action", ""),
                    obj=obl.get("object", ""),
                    pattern_id=pattern_id_val,
                )
        stats["controls_created"] += 1
        stats["candidates_processed"] += 1
    # -------------------------------------------------------------------
    # Merge Pass: Deduplicate implementation-level obligations
    # -------------------------------------------------------------------

    def run_merge_pass(self) -> dict:
        """Merge implementation-level duplicate obligations within each parent.

        When the same parent control has multiple obligations with nearly
        identical action+object (e.g. "SMS-Verbot" + "Policy-as-Code" both
        implementing a communication restriction), keep the more abstract
        one and mark the concrete one as merged.

        No LLM calls — purely rule-based using text similarity.

        Returns:
            Stats dict: parents_checked, obligations_merged,
            obligations_kept (global count of surviving validated rows).
        """
        stats = {
            "parents_checked": 0,
            "obligations_merged": 0,
            "obligations_kept": 0,
        }
        # Get all parents that have >1 validated obligation
        parents = self.db.execute(text("""
            SELECT parent_control_uuid, count(*) AS cnt
            FROM obligation_candidates
            WHERE release_state = 'validated' AND merged_into_id IS NULL
            GROUP BY parent_control_uuid
            HAVING count(*) > 1
        """)).fetchall()

        for parent_uuid, cnt in parents:
            stats["parents_checked"] += 1
            obligs = self.db.execute(text("""
                SELECT id, candidate_id, obligation_text, action, object
                FROM obligation_candidates
                WHERE parent_control_uuid = CAST(:pid AS uuid)
                  AND release_state = 'validated'
                  AND merged_into_id IS NULL
                ORDER BY created_at
            """), {"pid": str(parent_uuid)}).fetchall()

            # Pairwise O(n²) comparison within one parent; n is small
            # (obligations per control).
            merged_ids = set()
            oblig_list = list(obligs)
            for i in range(len(oblig_list)):
                if str(oblig_list[i][0]) in merged_ids:
                    continue
                for j in range(i + 1, len(oblig_list)):
                    if str(oblig_list[j][0]) in merged_ids:
                        continue
                    action_i = (oblig_list[i][3] or "").lower().strip()
                    action_j = (oblig_list[j][3] or "").lower().strip()
                    obj_i = (oblig_list[i][4] or "").lower().strip()
                    obj_j = (oblig_list[j][4] or "").lower().strip()
                    # Check if actions are similar enough to be duplicates
                    if not _text_similar(action_i, action_j, threshold=0.75):
                        continue
                    if not _text_similar(obj_i, obj_j, threshold=0.60):
                        continue
                    # Keep the more abstract one (shorter text = less specific)
                    text_i = oblig_list[i][2] or ""
                    text_j = oblig_list[j][2] or ""
                    if _is_more_implementation_specific(text_j, text_i):
                        survivor_id = str(oblig_list[i][0])
                        merged_id = str(oblig_list[j][0])
                    else:
                        survivor_id = str(oblig_list[j][0])
                        merged_id = str(oblig_list[i][0])
                    self.db.execute(text("""
                        UPDATE obligation_candidates
                        SET release_state = 'merged',
                            merged_into_id = CAST(:survivor AS uuid)
                        WHERE id = CAST(:merged AS uuid)
                    """), {"survivor": survivor_id, "merged": merged_id})
                    merged_ids.add(merged_id)
                    stats["obligations_merged"] += 1
            # Commit per parent to avoid large transactions
            self.db.commit()

        stats["obligations_kept"] = self.db.execute(text("""
            SELECT count(*) FROM obligation_candidates
            WHERE release_state = 'validated' AND merged_into_id IS NULL
        """)).fetchone()[0]

        logger.info("Merge pass: %s", stats)
        return stats

    # -------------------------------------------------------------------
    # Enrich Pass: Add metadata to obligations
    # -------------------------------------------------------------------

    def enrich_obligations(self) -> dict:
        """Add trigger_type and is_implementation_specific to obligations.

        Rule-based enrichment — no LLM calls. Only rows with
        ``trigger_type IS NULL`` are touched, so the pass is idempotent.

        Returns:
            Stats dict: enriched total, per-trigger-type counts,
            implementation_specific count.
        """
        stats = {
            "enriched": 0,
            "trigger_event": 0,
            "trigger_periodic": 0,
            "trigger_continuous": 0,
            "implementation_specific": 0,
        }
        obligs = self.db.execute(text("""
            SELECT id, obligation_text, condition, action, object
            FROM obligation_candidates
            WHERE release_state = 'validated'
              AND merged_into_id IS NULL
              AND trigger_type IS NULL
        """)).fetchall()

        for row in obligs:
            oc_id = str(row[0])
            obl_text = row[1] or ""
            condition = row[2] or ""
            action = row[3] or ""
            obj = row[4] or ""
            trigger = _classify_trigger_type(obl_text, condition)
            impl = _is_implementation_specific_text(obl_text, action, obj)
            self.db.execute(text("""
                UPDATE obligation_candidates
                SET trigger_type = :trigger,
                    is_implementation_specific = :impl
                WHERE id = CAST(:oid AS uuid)
            """), {"trigger": trigger, "impl": impl, "oid": oc_id})
            stats["enriched"] += 1
            # Assumes _classify_trigger_type returns one of
            # event/periodic/continuous (the keys pre-seeded above) —
            # other values would raise KeyError here.
            stats[f"trigger_{trigger}"] += 1
            if impl:
                stats["implementation_specific"] += 1

        self.db.commit()
        logger.info("Enrich pass: %s", stats)
        return stats

    # -------------------------------------------------------------------
    # Decomposition Status
    # -------------------------------------------------------------------

    def decomposition_status(self) -> dict:
        """Return decomposition progress.

        Single round-trip: nine scalar subselects, unpacked positionally.
        """
        row = self.db.execute(text("""
            SELECT
                (SELECT count(*) FROM canonical_controls
                 WHERE parent_control_uuid IS NULL
                   AND release_state NOT IN ('deprecated')) AS rich_controls,
                (SELECT count(DISTINCT parent_control_uuid)
                 FROM obligation_candidates) AS decomposed_controls,
                (SELECT count(*) FROM obligation_candidates) AS total_candidates,
                (SELECT count(*) FROM obligation_candidates
                 WHERE release_state = 'validated') AS validated,
                (SELECT count(*) FROM obligation_candidates
                 WHERE release_state = 'rejected') AS rejected,
                (SELECT count(*) FROM obligation_candidates
                 WHERE release_state = 'composed') AS composed,
                (SELECT count(*) FROM canonical_controls
                 WHERE parent_control_uuid IS NOT NULL) AS atomic_controls,
                (SELECT count(*) FROM obligation_candidates
                 WHERE release_state = 'merged') AS merged,
                (SELECT count(*) FROM obligation_candidates
                 WHERE trigger_type IS NOT NULL) AS enriched
        """)).fetchone()
        validated_for_0b = row[3] - (row[7] or 0)  # validated minus merged
        return {
            "rich_controls": row[0],
            "decomposed_controls": row[1],
            "total_candidates": row[2],
            "validated": row[3],
            "rejected": row[4],
            "composed": row[5],
            "atomic_controls": row[6],
            "merged": row[7] or 0,
            "enriched": row[8] or 0,
            "ready_for_pass0b": validated_for_0b,
            # max(..., 1) guards the zero-denominator case.
            "decomposition_pct": round(row[1] / max(row[0], 1) * 100, 1),
            "composition_pct": round(row[5] / max(validated_for_0b, 1) * 100, 1),
        }

    # -------------------------------------------------------------------
    # DB Writers
    # -------------------------------------------------------------------

    def _write_obligation_candidate(self, cand: ObligationCandidate) -> None:
        """Insert an obligation candidate into the DB.

        No commit here — the caller owns transaction boundaries.
        """
        self.db.execute(
            text("""
                INSERT INTO obligation_candidates (
                    parent_control_uuid, candidate_id, obligation_text,
                    action, object, condition, normative_strength,
                    is_test_obligation, is_reporting_obligation,
                    extraction_confidence, quality_flags, release_state
                ) VALUES (
                    CAST(:parent_uuid AS uuid), :candidate_id, :obligation_text,
                    :action, :object, :condition, :normative_strength,
                    :is_test, :is_reporting,
                    :confidence, :quality_flags, :release_state
                )
            """),
            {
                "parent_uuid": cand.parent_control_uuid,
                "candidate_id": cand.candidate_id,
                "obligation_text": cand.obligation_text,
                "action": cand.action,
                "object": cand.object_,
                "condition": cand.condition,
                "normative_strength": cand.normative_strength,
                "is_test": cand.is_test_obligation,
                "is_reporting": cand.is_reporting_obligation,
                "confidence": cand.extraction_confidence,
                "quality_flags": json.dumps(cand.quality_flags),
                "release_state": cand.release_state,
            },
        )

    def _write_atomic_control(
        self,
        atomic: AtomicControlCandidate,
        obl: dict,
    ) -> Optional[str]:
        """Insert an atomic control and create parent link.

        Returns the UUID of the newly created control, or None on failure.
Checks merge_hint to prevent duplicate controls under the same parent. """ parent_uuid = obl["parent_uuid"] candidate_id = obl["candidate_id"] # ── Duplicate Guard: skip if same merge_hint already exists ── merge_hint = getattr(atomic, "source_regulation", "") or "" if merge_hint: existing = self.db.execute( text(""" SELECT id::text FROM canonical_controls WHERE parent_control_uuid = CAST(:parent AS uuid) AND generation_metadata->>'merge_group_hint' = :hint AND release_state NOT IN ('rejected', 'deprecated', 'duplicate') LIMIT 1 """), {"parent": parent_uuid, "hint": merge_hint}, ).fetchone() if existing: logger.debug( "Duplicate guard: skipping %s — merge_hint %s already exists as %s", candidate_id, merge_hint, existing[0], ) return existing[0] result = self.db.execute( text(""" INSERT INTO canonical_controls ( control_id, title, objective, rationale, scope, requirements, test_procedure, evidence, severity, open_anchors, category, release_state, parent_control_uuid, decomposition_method, generation_metadata, framework_id, generation_strategy, pipeline_version ) VALUES ( :control_id, :title, :objective, :rationale, :scope, :requirements, :test_procedure, :evidence, :severity, :open_anchors, :category, 'draft', CAST(:parent_uuid AS uuid), 'pass0b', :gen_meta, CAST(:framework_id AS uuid), 'pass0b', 2 ) RETURNING id::text """), { "control_id": atomic.candidate_id, "title": atomic.title, "objective": atomic.objective, "rationale": getattr(atomic, "rationale", None) or "Aus Obligation abgeleitet.", "scope": json.dumps({}), "requirements": json.dumps(atomic.requirements), "test_procedure": json.dumps(atomic.test_procedure), "evidence": json.dumps(atomic.evidence), "severity": atomic.severity, "open_anchors": json.dumps([]), "category": atomic.category, "parent_uuid": parent_uuid, "gen_meta": json.dumps({ "decomposition_source": candidate_id, "decomposition_method": "pass0b", "engine_version": "v2", "action_object_class": getattr(atomic, "domain", ""), "merge_group_hint": 
atomic.source_regulation or "", "decomposition_confidence": getattr( atomic, "_decomposition_confidence", None ), "statement": getattr(atomic, "_statement", ""), "pattern_candidates": getattr(atomic, "_pattern_candidates", []), "deadline_hours": getattr(atomic, "_deadline_hours", None), "frequency": getattr(atomic, "_frequency", None), "validation_issues": getattr(atomic, "_validation_issues", []), "is_composite": getattr(atomic, "_is_composite", False), "atomicity": getattr(atomic, "_atomicity", "atomic"), "requires_decomposition": getattr(atomic, "_requires_decomposition", False), "framework_ref": getattr(atomic, "_framework_ref", None), "framework_domain": getattr(atomic, "_framework_domain", None), "framework_subcontrol_id": getattr(atomic, "_framework_subcontrol_id", None), "decomposition_source": getattr(atomic, "_decomposition_source", "direct"), }), "framework_id": "14b1bdd2-abc7-4a43-adae-14471ee5c7cf", }, ) row = result.fetchone() new_uuid = row[0] if row else None # Create M:N parent link (control_parent_links) if new_uuid: citation = _parse_citation(obl.get("parent_citation", "")) self.db.execute( text(""" INSERT INTO control_parent_links (control_uuid, parent_control_uuid, link_type, confidence, source_regulation, source_article, obligation_candidate_id) VALUES (CAST(:cu AS uuid), CAST(:pu AS uuid), 'decomposition', 1.0, :sr, :sa, CAST(:oci AS uuid)) ON CONFLICT (control_uuid, parent_control_uuid) DO NOTHING """), { "cu": new_uuid, "pu": parent_uuid, "sr": citation.get("source", ""), "sa": citation.get("article", ""), "oci": obl["oc_id"], }, ) return new_uuid def _next_atomic_seq(self, parent_control_id: str) -> int: """Get the next sequence number for atomic controls under a parent.""" result = self.db.execute( text(""" SELECT count(*) FROM canonical_controls WHERE parent_control_uuid = ( SELECT id FROM canonical_controls WHERE control_id = :parent_id LIMIT 1 ) """), {"parent_id": parent_control_id}, ).fetchone() return (result[0] if result else 0) + 
1 # ------------------------------------------------------------------- # Anthropic Batch API: Submit all controls as async batch (50% off) # ------------------------------------------------------------------- async def submit_batch_pass0a( self, limit: int = 0, batch_size: int = 5, category_filter: Optional[str] = None, source_filter: Optional[str] = None, ) -> dict: """Create an Anthropic Batch API request for Pass 0a. Groups controls into content-batches of `batch_size`, then submits all batches as one Anthropic Batch (up to 10,000 requests). Returns batch metadata for polling. """ query = """ SELECT cc.id, cc.control_id, cc.title, cc.objective, cc.requirements, cc.test_procedure, cc.source_citation, cc.category FROM canonical_controls cc WHERE cc.release_state NOT IN ('deprecated') AND cc.parent_control_uuid IS NULL AND NOT EXISTS ( SELECT 1 FROM obligation_candidates oc WHERE oc.parent_control_uuid = cc.id ) """ params = {} if category_filter: cats = [c.strip() for c in category_filter.split(",") if c.strip()] if cats: query += " AND cc.category IN :cats" params["cats"] = tuple(cats) if source_filter: sources = [s.strip() for s in source_filter.split(",") if s.strip()] if sources: clauses = [] for idx, src in enumerate(sources): key = f"src_{idx}" clauses.append(f"cc.source_citation::text ILIKE :{key}") params[key] = f"%{src}%" query += " AND (" + " OR ".join(clauses) + ")" query += " ORDER BY cc.created_at" if limit > 0: query += f" LIMIT {limit}" rows = self.db.execute(text(query), params).fetchall() # Prepare control data (skip empty) prepared = [] for row in rows: title = row[2] or "" objective = row[3] or "" req_str = _format_field(row[4] or "") if not title and not objective and not req_str: continue prepared.append({ "uuid": str(row[0]), "control_id": row[1] or "", "title": title, "objective": objective, "requirements": req_str, "test_procedure": _format_field(row[5] or ""), "source_ref": _format_citation(row[6] or ""), "category": row[7] or "", }) if 
not prepared: return {"status": "empty", "total_controls": 0} # Build batch requests (each request = batch_size controls) requests = [] for i in range(0, len(prepared), batch_size): batch = prepared[i : i + batch_size] if len(batch) > 1: prompt = _build_pass0a_batch_prompt(batch) else: ctrl = batch[0] prompt = _build_pass0a_prompt( title=ctrl["title"], objective=ctrl["objective"], requirements=ctrl["requirements"], test_procedure=ctrl["test_procedure"], source_ref=ctrl["source_ref"], ) # Control IDs in custom_id for result mapping ids_str = "+".join(c["control_id"] for c in batch) requests.append({ "custom_id": f"p0a_{ids_str}", "params": { "model": ANTHROPIC_MODEL, "max_tokens": max(8192, len(batch) * 2000), "system": [ { "type": "text", "text": _PASS0A_SYSTEM_PROMPT, "cache_control": {"type": "ephemeral"}, } ], "messages": [{"role": "user", "content": prompt}], }, }) batch_result = await create_anthropic_batch(requests) batch_id = batch_result.get("id", "") logger.info( "Batch API submitted: %s — %d requests (%d controls, batch_size=%d)", batch_id, len(requests), len(prepared), batch_size, ) return { "status": "submitted", "batch_id": batch_id, "total_controls": len(prepared), "total_requests": len(requests), "batch_size": batch_size, "category_filter": category_filter, "source_filter": source_filter, } async def submit_batch_pass0b( self, limit: int = 0, batch_size: int = 5, ) -> dict: """Create an Anthropic Batch API request for Pass 0b.""" query = """ SELECT oc.id, oc.candidate_id, oc.parent_control_uuid, oc.obligation_text, oc.action, oc.object, oc.is_test_obligation, oc.is_reporting_obligation, cc.title AS parent_title, cc.category AS parent_category, cc.source_citation AS parent_citation, cc.severity AS parent_severity, cc.control_id AS parent_control_id FROM obligation_candidates oc JOIN canonical_controls cc ON cc.id = oc.parent_control_uuid WHERE oc.release_state = 'validated' AND NOT EXISTS ( SELECT 1 FROM canonical_controls ac WHERE 
ac.parent_control_uuid = oc.parent_control_uuid AND ac.decomposition_method = 'pass0b' AND ac.release_state NOT IN ('deprecated', 'duplicate') AND ac.title LIKE '%' || LEFT(oc.action, 20) || '%' ) """ if limit > 0: query += f" LIMIT {limit}" rows = self.db.execute(text(query)).fetchall() prepared = [] for row in rows: prepared.append({ "oc_id": str(row[0]), "candidate_id": row[1] or "", "parent_uuid": str(row[2]), "obligation_text": row[3] or "", "action": row[4] or "", "object": row[5] or "", "is_test": row[6], "is_reporting": row[7], "parent_title": row[8] or "", "parent_category": row[9] or "", "parent_citation": row[10] or "", "parent_severity": row[11] or "medium", "parent_control_id": row[12] or "", "source_ref": _format_citation(row[10] or ""), }) if not prepared: return {"status": "empty", "total_candidates": 0} requests = [] for i in range(0, len(prepared), batch_size): batch = prepared[i : i + batch_size] if len(batch) > 1: prompt = _build_pass0b_batch_prompt(batch) else: obl = batch[0] prompt = _build_pass0b_prompt( obligation_text=obl["obligation_text"], action=obl["action"], object_=obl["object"], parent_title=obl["parent_title"], parent_category=obl["parent_category"], source_ref=obl["source_ref"], ) ids_str = "+".join(o["candidate_id"] for o in batch) requests.append({ "custom_id": f"p0b_{ids_str}", "params": { "model": ANTHROPIC_MODEL, "max_tokens": max(8192, len(batch) * 1500), "system": [ { "type": "text", "text": _PASS0B_SYSTEM_PROMPT, "cache_control": {"type": "ephemeral"}, } ], "messages": [{"role": "user", "content": prompt}], }, }) batch_result = await create_anthropic_batch(requests) batch_id = batch_result.get("id", "") logger.info( "Batch API Pass 0b submitted: %s — %d requests (%d candidates)", batch_id, len(requests), len(prepared), ) return { "status": "submitted", "batch_id": batch_id, "total_candidates": len(prepared), "total_requests": len(requests), "batch_size": batch_size, } async def process_batch_results( self, batch_id: str, 
pass_type: str = "0a", ) -> dict: """Fetch and process results from a completed Anthropic batch. Args: batch_id: Anthropic batch ID. pass_type: "0a" or "0b". """ # Check status first status = await check_batch_status(batch_id) if status.get("processing_status") != "ended": return { "status": "not_ready", "processing_status": status.get("processing_status"), "request_counts": status.get("request_counts", {}), } results = await fetch_batch_results(batch_id) stats = { "results_processed": 0, "results_succeeded": 0, "results_failed": 0, "errors": 0, } if pass_type == "0a": stats.update({ "controls_processed": 0, "obligations_extracted": 0, "obligations_validated": 0, "obligations_rejected": 0, }) else: stats.update({ "candidates_processed": 0, "controls_created": 0, "llm_failures": 0, }) for result in results: custom_id = result.get("custom_id", "") result_data = result.get("result", {}) stats["results_processed"] += 1 if result_data.get("type") != "succeeded": stats["results_failed"] += 1 logger.warning("Batch result failed: %s — %s", custom_id, result_data) continue stats["results_succeeded"] += 1 message = result_data.get("message", {}) content = message.get("content", []) text_content = content[0].get("text", "") if content else "" try: if pass_type == "0a": self._handle_batch_result_0a(custom_id, text_content, stats) else: await self._handle_batch_result_0b(custom_id, text_content, stats) except Exception as e: logger.error("Processing batch result %s: %s", custom_id, e) stats["errors"] += 1 self.db.commit() stats["status"] = "completed" return stats def _handle_batch_result_0a( self, custom_id: str, text_content: str, stats: dict, ) -> None: """Process a single Pass 0a batch result.""" # custom_id format: p0a_CTRL-001+CTRL-002+... 
prefix = "p0a_" control_ids = custom_id[len(prefix):].split("+") if custom_id.startswith(prefix) else [] if len(control_ids) == 1: raw_obls = _parse_json_array(text_content) control_id = control_ids[0] uuid_row = self.db.execute( text("SELECT id FROM canonical_controls WHERE control_id = :cid LIMIT 1"), {"cid": control_id}, ).fetchone() if not uuid_row: return control_uuid = str(uuid_row[0]) if not raw_obls: raw_obls = [{"obligation_text": control_id, "action": "sicherstellen", "object": control_id}] self._process_pass0a_obligations(raw_obls, control_id, control_uuid, stats) stats["controls_processed"] += 1 else: results_by_id = _parse_json_object(text_content) for control_id in control_ids: uuid_row = self.db.execute( text("SELECT id FROM canonical_controls WHERE control_id = :cid LIMIT 1"), {"cid": control_id}, ).fetchone() if not uuid_row: continue control_uuid = str(uuid_row[0]) raw_obls = results_by_id.get(control_id, []) if not isinstance(raw_obls, list): raw_obls = [raw_obls] if raw_obls else [] if not raw_obls: raw_obls = [{"obligation_text": control_id, "action": "sicherstellen", "object": control_id}] self._process_pass0a_obligations(raw_obls, control_id, control_uuid, stats) stats["controls_processed"] += 1 async def _handle_batch_result_0b( self, custom_id: str, text_content: str, stats: dict, ) -> None: """Process a single Pass 0b batch result.""" prefix = "p0b_" candidate_ids = custom_id[len(prefix):].split("+") if custom_id.startswith(prefix) else [] if len(candidate_ids) == 1: parsed = _parse_json_object(text_content) obl = self._load_obligation_for_0b(candidate_ids[0]) if obl: await self._process_pass0b_control(obl, parsed, stats) else: results_by_id = _parse_json_object(text_content) for cand_id in candidate_ids: parsed = results_by_id.get(cand_id, {}) obl = self._load_obligation_for_0b(cand_id) if obl: await self._process_pass0b_control(obl, parsed, stats) def _load_obligation_for_0b(self, candidate_id: str) -> Optional[dict]: """Load obligation 
data needed for Pass 0b processing.""" row = self.db.execute( text(""" SELECT oc.id, oc.candidate_id, oc.parent_control_uuid, oc.obligation_text, oc.action, oc.object, oc.is_test_obligation, oc.is_reporting_obligation, cc.title, cc.category, cc.source_citation, cc.severity, cc.control_id FROM obligation_candidates oc JOIN canonical_controls cc ON cc.id = oc.parent_control_uuid WHERE oc.candidate_id = :cid """), {"cid": candidate_id}, ).fetchone() if not row: return None return { "oc_id": str(row[0]), "candidate_id": row[1] or "", "parent_uuid": str(row[2]), "obligation_text": row[3] or "", "action": row[4] or "", "object": row[5] or "", "is_test": row[6], "is_reporting": row[7], "parent_title": row[8] or "", "parent_category": row[9] or "", "parent_citation": row[10] or "", "parent_severity": row[11] or "medium", "parent_control_id": row[12] or "", "source_ref": _format_citation(row[10] or ""), } # --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- def _fallback_obligation(ctrl: dict) -> dict: """Create a single fallback obligation when LLM returns nothing.""" return { "obligation_text": ctrl.get("objective") or ctrl.get("title", ""), "action": "sicherstellen", "object": ctrl.get("title", ""), "condition": None, "normative_strength": "must", "is_test_obligation": False, "is_reporting_obligation": False, } def _format_field(value) -> str: """Format a requirements/test_procedure field for the LLM prompt.""" if not value: return "" if isinstance(value, str): try: parsed = json.loads(value) if isinstance(parsed, list): return "\n".join(f"- {item}" for item in parsed) return value except (json.JSONDecodeError, TypeError): return value if isinstance(value, list): return "\n".join(f"- {item}" for item in value) return str(value) def _format_citation(citation) -> str: """Format source_citation for display.""" if not citation: return "" if isinstance(citation, 
str): try: c = json.loads(citation) if isinstance(c, dict): parts = [] if c.get("source"): parts.append(c["source"]) if c.get("article"): parts.append(c["article"]) if c.get("paragraph"): parts.append(c["paragraph"]) return " ".join(parts) if parts else citation except (json.JSONDecodeError, TypeError): return citation return str(citation) def _parse_citation(citation) -> dict: """Parse source_citation JSONB into a dict with source/article/paragraph.""" if not citation: return {} if isinstance(citation, dict): return citation if isinstance(citation, str): try: c = json.loads(citation) if isinstance(c, dict): return c except (json.JSONDecodeError, TypeError): pass return {} def _compute_extraction_confidence(flags: dict) -> float: """Compute confidence score from quality flags.""" score = 0.0 weights = { "has_normative_signal": 0.30, "single_action": 0.20, "not_rationale": 0.20, "not_evidence_only": 0.15, "min_length": 0.10, "has_parent_link": 0.05, } for flag, weight in weights.items(): if flags.get(flag, False): score += weight return round(score, 2) def _normalize_severity(val: str) -> str: """Normalize severity value.""" val = (val or "medium").lower().strip() if val in ("critical", "high", "medium", "low"): return val return "medium" # Action-type-based severity calibration: not every atomic control # inherits the parent's severity. Definition and review controls are # typically medium, while implementation controls stay high. 
# Per-action-type ceiling on severity: paperwork-style actions are capped
# at "medium", enforcement-style actions may carry the parent's "high".
_ACTION_SEVERITY_CAP: dict[str, str] = {
    "define": "medium",
    "review": "medium",
    "document": "medium",
    "report": "medium",
    "test": "medium",
    "rotate": "medium",
    "implement": "high",
    "configure": "high",
    "monitor": "high",
    "enforce": "high",
    "prevent": "high",
    "exclude": "high",
    "forbid": "high",
    "invalidate": "high",
    "issue": "high",
}

# Rank used when comparing two severity labels.
_SEVERITY_ORDER = {"low": 0, "medium": 1, "high": 2, "critical": 3}


def _calibrate_severity(parent_severity: str, action_type: str) -> str:
    """Calibrate severity based on action type.

    Implementation/enforcement inherits parent severity.
    Definition/review/test/documentation caps at medium.
    Unknown action types inherit the parent severity unchanged.
    """
    inherited = _normalize_severity(parent_severity)
    ceiling = _ACTION_SEVERITY_CAP.get(action_type)
    if ceiling is None:
        return inherited
    # The lower-ranked label wins; on a tie min() keeps the parent value.
    return min(inherited, ceiling, key=lambda s: _SEVERITY_ORDER.get(s, 1))


# _template_fallback removed — replaced by _compose_deterministic engine