"""Decomposition Pass — Split Rich Controls into Atomic Controls.

Pass 0 of the Multi-Layer Control Architecture migration. Runs BEFORE
Passes 1-5 (obligation linkage, pattern classification, etc.).

Two sub-passes:
  Pass 0a: Obligation Extraction — extract individual normative obligations
           from a Rich Control using LLM with strict guardrails.
  Pass 0b: Atomic Control Composition — turn each obligation candidate into
           a standalone atomic control record.

Plus a Quality Gate that validates extraction results.

Guardrails (the 6 rules):
  1. Only normative statements (müssen, sicherzustellen, verpflichtet, ...)
  2. One main verb per obligation
  3. Test obligations separate from operational obligations
  4. Reporting obligations separate
  5. Don't split at evidence level
  6. Parent link always preserved
"""

import json
import logging
import re
import uuid
from dataclasses import dataclass, field
from typing import Optional

from sqlalchemy import text
from sqlalchemy.orm import Session

logger = logging.getLogger(__name__)


# ---------------------------------------------------------------------------
# Normative signal detection (Rule 1)
# ---------------------------------------------------------------------------

_NORMATIVE_SIGNALS = [
    r"\bmüssen\b",
    r"\bmuss\b",
    r"\bhat\s+sicherzustellen\b",
    r"\bhaben\s+sicherzustellen\b",
    r"\bsind\s+verpflichtet\b",
    r"\bist\s+verpflichtet\b",
    r"\bist\s+zu\s+\w+en\b",
    r"\bsind\s+zu\s+\w+en\b",
    r"\bhat\s+zu\s+\w+en\b",
    r"\bhaben\s+zu\s+\w+en\b",
    r"\bsoll\b",
    r"\bsollen\b",
    r"\bgewährleisten\b",
    r"\bsicherstellen\b",
    r"\bshall\b",
    r"\bmust\b",
    r"\brequired\b",
    r"\bshould\b",
    r"\bensure\b",
]
_NORMATIVE_RE = re.compile("|".join(_NORMATIVE_SIGNALS), re.IGNORECASE)

_RATIONALE_SIGNALS = [
    r"\bda\s+",
    r"\bweil\b",
    r"\bgrund\b",
    r"\berwägung",
    r"\bbecause\b",
    r"\breason\b",
    r"\brationale\b",
    r"\bkönnen\s+.*\s+verursachen\b",
    r"\bführt\s+zu\b",
]
_RATIONALE_RE = re.compile("|".join(_RATIONALE_SIGNALS), re.IGNORECASE)

_TEST_SIGNALS = [
    r"\btesten\b",
    r"\btest\b",
    r"\bprüfung\b",
    r"\bprüfen\b",
    r"\bgetestet\b",
    r"\bwirksamkeit\b",
    r"\baudit\b",
    r"\bregelmäßig\b.*\b(prüf|test|kontroll)",
    r"\beffectiveness\b",
    r"\bverif",
]
_TEST_RE = re.compile("|".join(_TEST_SIGNALS), re.IGNORECASE)

_REPORTING_SIGNALS = [
    r"\bmelden\b",
    r"\bmeldung\b",
    r"\bunterricht",
    r"\binformieren\b",
    r"\bbenachricht",
    r"\bnotif",
    r"\breport\b",
    r"\bbehörd",
]
_REPORTING_RE = re.compile("|".join(_REPORTING_SIGNALS), re.IGNORECASE)

# Quality-gate heuristics, compiled once at import time instead of on every
# quality_gate() call.
# A conjunction followed later by a second obligation verb hints at two actions.
_MULTI_VERB_RE = re.compile(
    r"\b(und|sowie|als auch)\b.*\b(müssen|sicherstellen|implementieren"
    r"|dokumentieren|melden|testen|prüfen|überwachen|gewährleisten)\b",
    re.IGNORECASE,
)
# Text that opens with an evidence noun is treated as an evidence fragment.
_EVIDENCE_ONLY_RE = re.compile(
    r"^(Nachweis|Dokumentation|Screenshot|Protokoll|Bericht|Zertifikat)",
    re.IGNORECASE,
)


# ---------------------------------------------------------------------------
# Data classes
# ---------------------------------------------------------------------------

@dataclass
class ObligationCandidate:
    """A single normative obligation extracted from a Rich Control."""

    candidate_id: str = ""
    parent_control_uuid: str = ""
    obligation_text: str = ""
    action: str = ""
    object_: str = ""
    condition: Optional[str] = None
    normative_strength: str = "must"
    is_test_obligation: bool = False
    is_reporting_obligation: bool = False
    extraction_confidence: float = 0.0
    quality_flags: dict = field(default_factory=dict)
    release_state: str = "extracted"

    def to_dict(self) -> dict:
        """Return the candidate as a plain dict (``object_`` is exported as ``object``)."""
        return {
            "candidate_id": self.candidate_id,
            "parent_control_uuid": self.parent_control_uuid,
            "obligation_text": self.obligation_text,
            "action": self.action,
            "object": self.object_,
            "condition": self.condition,
            "normative_strength": self.normative_strength,
            "is_test_obligation": self.is_test_obligation,
            "is_reporting_obligation": self.is_reporting_obligation,
            "extraction_confidence": self.extraction_confidence,
            "quality_flags": self.quality_flags,
            "release_state": self.release_state,
        }


@dataclass
class AtomicControlCandidate:
    """An atomic control composed from a single ObligationCandidate."""

    candidate_id: str = ""
    parent_control_uuid: str = ""
    obligation_candidate_id: str = ""
    title: str = ""
    objective: str = ""
    requirements: list = field(default_factory=list)
    test_procedure: list = field(default_factory=list)
    evidence: list = field(default_factory=list)
    severity: str = "medium"
    category: str = ""
    domain: str = ""
    source_regulation: str = ""
    source_article: str = ""

    def to_dict(self) -> dict:
        """Return every field of the candidate as a plain dict."""
        return {
            "candidate_id": self.candidate_id,
            "parent_control_uuid": self.parent_control_uuid,
            "obligation_candidate_id": self.obligation_candidate_id,
            "title": self.title,
            "objective": self.objective,
            "requirements": self.requirements,
            "test_procedure": self.test_procedure,
            "evidence": self.evidence,
            "severity": self.severity,
            "category": self.category,
            "domain": self.domain,
            # These two fields are declared on the dataclass and were
            # previously dropped from the serialized form.
            "source_regulation": self.source_regulation,
            "source_article": self.source_article,
        }


# ---------------------------------------------------------------------------
# Quality Gate
# ---------------------------------------------------------------------------

def quality_gate(candidate: ObligationCandidate) -> dict:
    """Validate an obligation candidate and return its quality flags.

    Every flag is a bool where ``True`` means "check passed":

        has_normative_signal: text contains normative language
        single_action: only one main action (heuristic)
        not_rationale: at least as many normative as rationale signals
        not_evidence_only: text does not start with an evidence noun
        min_length: stripped text has at least 20 characters
        has_parent_link: parent_control_uuid is set
    """
    # Tolerate a missing text so a malformed candidate is flagged, not fatal.
    txt = candidate.obligation_text or ""
    stripped = txt.strip()

    normative_count = len(_NORMATIVE_RE.findall(txt))
    rationale_count = len(_RATIONALE_RE.findall(txt))

    return {
        # 1. Normative signal
        "has_normative_signal": bool(_NORMATIVE_RE.search(txt)),
        # 2. Single action heuristic — a conjunction followed by another
        #    obligation verb (imperfect but useful)
        "single_action": not _MULTI_VERB_RE.search(txt),
        # 3. Not rationale
        "not_rationale": normative_count >= rationale_count,
        # 4. Not evidence-only (evidence fragments are typically short
        #    noun phrases)
        "not_evidence_only": not _EVIDENCE_ONLY_RE.match(stripped),
        # 5. Min length
        "min_length": len(stripped) >= 20,
        # 6. Parent link
        "has_parent_link": bool(candidate.parent_control_uuid),
    }


def passes_quality_gate(flags: dict) -> bool:
    """Return True when all critical quality flags are set.

    ``single_action`` and ``not_rationale`` are not critical; they only feed
    into the confidence score.
    """
    critical = ["has_normative_signal", "not_evidence_only", "min_length", "has_parent_link"]
    return all(flags.get(k, False) for k in critical)


# ---------------------------------------------------------------------------
# LLM Prompts
# ---------------------------------------------------------------------------

_PASS0A_SYSTEM_PROMPT = """\
Du bist ein Rechts-Compliance-Experte. Du zerlegst Compliance-Controls \
in einzelne atomare Pflichten.

REGELN (STRIKT EINHALTEN):
1. Nur normative Aussagen extrahieren — erkennbar an: müssen, haben \
sicherzustellen, sind verpflichtet, ist zu dokumentieren, ist zu melden, \
ist zu testen, shall, must, required.
2. Jede Pflicht hat genau EIN Hauptverb / eine Handlung.
3. Testpflichten SEPARAT von operativen Pflichten (is_test_obligation=true).
4. Meldepflichten SEPARAT (is_reporting_obligation=true).
5. NICHT auf Evidence-Ebene zerlegen (z.B. "DR-Plan vorhanden" ist KEIN \
eigenes Control, sondern Evidence).
6. Begründungen, Erläuterungen und Erwägungsgründe sind KEINE Pflichten \
— NICHT extrahieren.

Antworte NUR mit einem JSON-Array. Keine Erklärungen."""


def _build_pass0a_prompt(
    title: str, objective: str, requirements: str,
    test_procedure: str, source_ref: str
) -> str:
    """Build the Pass 0a user prompt asking for a JSON array of obligations."""
    return f"""\
Analysiere das folgende Control und extrahiere alle einzelnen normativen \
Pflichten als JSON-Array.

CONTROL:
Titel: {title}
Ziel: {objective}
Anforderungen: {requirements}
Prüfverfahren: {test_procedure}
Quellreferenz: {source_ref}

Antworte als JSON-Array:
[
  {{
    "obligation_text": "Kurze, präzise Formulierung der Pflicht",
    "action": "Hauptverb/Handlung",
    "object": "Gegenstand der Pflicht",
    "condition": "Auslöser/Bedingung oder null",
    "normative_strength": "must",
    "is_test_obligation": false,
    "is_reporting_obligation": false
  }}
]"""


_PASS0B_SYSTEM_PROMPT = """\
Du bist ein Security-Compliance-Experte. Du erstellst aus einer einzelnen \
normativen Pflicht ein praxisorientiertes, atomares Security Control.

Das Control muss UMSETZBAR sein — keine Gesetzesparaphrase.
Antworte NUR als JSON. Keine Erklärungen."""


def _build_pass0b_prompt(
    obligation_text: str, action: str, object_: str,
    parent_title: str, parent_category: str, source_ref: str,
) -> str:
    """Build the Pass 0b user prompt asking for one atomic control as JSON."""
    return f"""\
Erstelle aus der folgenden Pflicht ein atomares Control.

PFLICHT: {obligation_text}
HANDLUNG: {action}
GEGENSTAND: {object_}

KONTEXT (Ursprungs-Control):
Titel: {parent_title}
Kategorie: {parent_category}
Quellreferenz: {source_ref}

Antworte als JSON:
{{
  "title": "Kurzer Titel (max 80 Zeichen, deutsch)",
  "objective": "Was muss erreicht werden? (1-2 Sätze)",
  "requirements": ["Konkrete Anforderung 1", "Anforderung 2"],
  "test_procedure": ["Prüfschritt 1", "Prüfschritt 2"],
  "evidence": ["Nachweis 1", "Nachweis 2"],
  "severity": "critical|high|medium|low",
  "category": "security|privacy|governance|operations|finance|reporting"
}}"""


# ---------------------------------------------------------------------------
# Parse helpers
# ---------------------------------------------------------------------------

def _parse_json_array(text: str) -> list[dict]:
    """Extract a JSON array of objects from LLM response text.

    Tries a direct parse first, then the widest ``[...]`` span in the text.
    A single top-level object is wrapped into a one-element list. Items that
    are not JSON objects are dropped so callers can rely on ``dict`` entries.
    Returns ``[]`` when nothing usable is found (including non-string input).
    """
    if not isinstance(text, str):
        return []

    # Try direct parse
    try:
        result = json.loads(text)
    except json.JSONDecodeError:
        result = None
    if isinstance(result, dict):
        return [result]
    if isinstance(result, list):
        return [item for item in result if isinstance(item, dict)]

    # Try extracting JSON array block
    match = re.search(r"\[[\s\S]*\]", text)
    if match:
        try:
            result = json.loads(match.group())
        except json.JSONDecodeError:
            return []
        if isinstance(result, list):
            return [item for item in result if isinstance(item, dict)]
    return []


def _parse_json_object(text: str) -> dict:
    """Extract a JSON object from LLM response text.

    Tries a direct parse first, then the widest ``{...}`` span in the text.
    Returns ``{}`` when nothing usable is found (including non-string input).
    """
    if not isinstance(text, str):
        return {}

    try:
        result = json.loads(text)
        if isinstance(result, dict):
            return result
    except json.JSONDecodeError:
        pass

    match = re.search(r"\{[\s\S]*\}", text)
    if match:
        try:
            result = json.loads(match.group())
            if isinstance(result, dict):
                return result
        except json.JSONDecodeError:
            pass
    return {}


def _ensure_list(val) -> list:
    """Return *val* as a list: lists pass through, a non-empty str is wrapped."""
    if isinstance(val, list):
        return val
    if isinstance(val, str):
        return [val] if val else []
    return []


def _coerce_str(val, default: str = "") -> str:
    """Return *val* as ``str``; ``None`` becomes *default*.

    LLM JSON may carry ``null`` or non-string values where text is expected.
    """
    if val is None:
        return default
    return val if isinstance(val, str) else str(val)


def _candidate_from_raw(
    raw: dict, control_uuid: str, control_id: str, idx: int
) -> ObligationCandidate:
    """Build a quality-gated ObligationCandidate from one raw LLM entry.

    Sets the test/reporting flags (LLM value or regex auto-detection), the
    quality flags, the confidence score and the resulting release state
    (``validated`` or ``rejected``).
    """
    cand = ObligationCandidate(
        candidate_id=f"OC-{control_id}-{idx + 1:02d}",
        parent_control_uuid=control_uuid,
        obligation_text=_coerce_str(raw.get("obligation_text")),
        action=_coerce_str(raw.get("action")),
        object_=_coerce_str(raw.get("object")),
        condition=raw.get("condition"),
        normative_strength=_coerce_str(raw.get("normative_strength"), "must") or "must",
        is_test_obligation=bool(raw.get("is_test_obligation", False)),
        is_reporting_obligation=bool(raw.get("is_reporting_obligation", False)),
    )

    # Auto-detect test/reporting if LLM missed it
    if not cand.is_test_obligation and _TEST_RE.search(cand.obligation_text):
        cand.is_test_obligation = True
    if not cand.is_reporting_obligation and _REPORTING_RE.search(cand.obligation_text):
        cand.is_reporting_obligation = True

    # Quality gate
    flags = quality_gate(cand)
    cand.quality_flags = flags
    cand.extraction_confidence = _compute_extraction_confidence(flags)
    cand.release_state = "validated" if passes_quality_gate(flags) else "rejected"
    return cand


# ---------------------------------------------------------------------------
# Decomposition Pass
# ---------------------------------------------------------------------------

class DecompositionPass:
    """Pass 0: Decompose Rich Controls into atomic candidates.

    Usage::

        decomp = DecompositionPass(db=session)
        stats_0a = await decomp.run_pass0a(limit=100)
        stats_0b = await decomp.run_pass0b(limit=100)
    """

    def __init__(self, db: Session):
        self.db = db

    # -------------------------------------------------------------------
    # Pass 0a: Obligation Extraction
    # -------------------------------------------------------------------

    async def run_pass0a(self, limit: int = 0) -> dict:
        """Extract obligation candidates from rich controls.

        Processes controls that have NOT been decomposed yet (no rows
        in obligation_candidates for that control).

        Args:
            limit: Maximum number of controls to process; ``0`` means no limit.

        Returns:
            Counter dict with the keys ``controls_processed``,
            ``obligations_extracted``, ``obligations_validated``,
            ``obligations_rejected``, ``controls_skipped_empty`` and ``errors``.
            Obligation counters only include candidates that were written
            successfully.
        """
        from compliance.services.obligation_extractor import _llm_ollama

        # Find rich controls not yet decomposed
        query = """
            SELECT cc.id, cc.control_id, cc.title, cc.objective,
                   cc.requirements, cc.test_procedure,
                   cc.source_citation, cc.category
            FROM canonical_controls cc
            WHERE cc.release_state NOT IN ('deprecated')
              AND cc.parent_control_uuid IS NULL
              AND NOT EXISTS (
                  SELECT 1 FROM obligation_candidates oc
                  WHERE oc.parent_control_uuid = cc.id
              )
            ORDER BY cc.created_at
        """
        if limit > 0:
            # int() guarantees only a number is ever interpolated into the SQL.
            query += f" LIMIT {int(limit)}"

        rows = self.db.execute(text(query)).fetchall()

        stats = {
            "controls_processed": 0,
            "obligations_extracted": 0,
            "obligations_validated": 0,
            "obligations_rejected": 0,
            "controls_skipped_empty": 0,
            "errors": 0,
        }

        for row in rows:
            control_uuid = str(row[0])
            control_id = row[1] or ""
            title = row[2] or ""
            objective = row[3] or ""
            requirements = row[4] or ""
            test_procedure = row[5] or ""
            source_citation = row[6] or ""

            # Format requirements/test_procedure if JSON
            req_str = _format_field(requirements)
            test_str = _format_field(test_procedure)
            source_str = _format_citation(source_citation)

            if not title and not objective and not req_str:
                stats["controls_skipped_empty"] += 1
                continue

            try:
                prompt = _build_pass0a_prompt(
                    title=title,
                    objective=objective,
                    requirements=req_str,
                    test_procedure=test_str,
                    source_ref=source_str,
                )

                llm_response = await _llm_ollama(
                    prompt=prompt,
                    system_prompt=_PASS0A_SYSTEM_PROMPT,
                )

                raw_obligations = _parse_json_array(llm_response)

                if not raw_obligations:
                    # Fallback: treat the whole control as one obligation
                    raw_obligations = [{
                        "obligation_text": objective or title,
                        "action": "sicherstellen",
                        "object": title,
                        "condition": None,
                        "normative_strength": "must",
                        "is_test_obligation": False,
                        "is_reporting_obligation": False,
                    }]

                candidates = [
                    _candidate_from_raw(raw, control_uuid, control_id, idx)
                    for idx, raw in enumerate(raw_obligations)
                ]

                # Write all candidates of one control inside a savepoint: a
                # failing insert then rolls back only this control and leaves
                # the session usable for the remaining rows. Without it, a
                # partially written control would also count as "decomposed"
                # on the next run.
                with self.db.begin_nested():
                    for cand in candidates:
                        self._write_obligation_candidate(cand)

                validated = sum(
                    1 for cand in candidates if cand.release_state == "validated"
                )
                stats["obligations_validated"] += validated
                stats["obligations_rejected"] += len(candidates) - validated
                stats["obligations_extracted"] += len(candidates)
                stats["controls_processed"] += 1

            except Exception as e:
                # Best-effort batch: record the failure (with traceback) and
                # continue with the next control.
                logger.exception("Pass 0a failed for %s: %s", control_id, e)
                stats["errors"] += 1

        self.db.commit()
        logger.info("Pass 0a: %s", stats)
        return stats

    # -------------------------------------------------------------------
    # Pass 0b: Atomic Control Composition
    # -------------------------------------------------------------------

    async def run_pass0b(self, limit: int = 0) -> dict:
        """Compose atomic controls from validated obligation candidates.

        Processes obligation_candidates with release_state='validated'
        that don't have a corresponding atomic control yet. When the LLM
        response has no usable ``title``, a template-based control is created
        instead and counted under ``llm_failures``.

        Args:
            limit: Maximum number of candidates to process; ``0`` means no limit.

        Returns:
            Counter dict with the keys ``candidates_processed``,
            ``controls_created``, ``llm_failures`` and ``errors``.
        """
        from compliance.services.obligation_extractor import _llm_ollama

        # NOTE(review): the NOT EXISTS clause de-duplicates by matching the
        # first 20 chars of oc.action against sibling titles. An empty action
        # yields LIKE '%%', which matches any existing sibling — confirm this
        # heuristic is intended, or match on generation_metadata instead.
        query = """
            SELECT oc.id, oc.candidate_id, oc.parent_control_uuid,
                   oc.obligation_text, oc.action, oc.object,
                   oc.is_test_obligation, oc.is_reporting_obligation,
                   cc.title AS parent_title,
                   cc.category AS parent_category,
                   cc.source_citation AS parent_citation,
                   cc.severity AS parent_severity,
                   cc.control_id AS parent_control_id
            FROM obligation_candidates oc
            JOIN canonical_controls cc ON cc.id = oc.parent_control_uuid
            WHERE oc.release_state = 'validated'
              AND NOT EXISTS (
                  SELECT 1 FROM canonical_controls ac
                  WHERE ac.parent_control_uuid = oc.parent_control_uuid
                    AND ac.decomposition_method = 'pass0b'
                    AND ac.title LIKE '%' || LEFT(oc.action, 20) || '%'
              )
        """
        if limit > 0:
            # int() guarantees only a number is ever interpolated into the SQL.
            query += f" LIMIT {int(limit)}"

        rows = self.db.execute(text(query)).fetchall()

        stats = {
            "candidates_processed": 0,
            "controls_created": 0,
            "llm_failures": 0,
            "errors": 0,
        }

        for row in rows:
            oc_id = str(row[0])
            candidate_id = row[1] or ""
            parent_uuid = str(row[2])
            obligation_text = row[3] or ""
            action = row[4] or ""
            object_ = row[5] or ""
            is_test = row[6]
            is_reporting = row[7]
            parent_title = row[8] or ""
            parent_category = row[9] or ""
            parent_citation = row[10] or ""
            parent_severity = row[11] or "medium"
            parent_control_id = row[12] or ""

            source_str = _format_citation(parent_citation)

            try:
                prompt = _build_pass0b_prompt(
                    obligation_text=obligation_text,
                    action=action,
                    object_=object_,
                    parent_title=parent_title,
                    parent_category=parent_category,
                    source_ref=source_str,
                )

                llm_response = await _llm_ollama(
                    prompt=prompt,
                    system_prompt=_PASS0B_SYSTEM_PROMPT,
                )

                parsed = _parse_json_object(llm_response)

                if not parsed or not parsed.get("title"):
                    # Template fallback — no LLM needed
                    atomic = _template_fallback(
                        obligation_text=obligation_text,
                        action=action,
                        object_=object_,
                        parent_title=parent_title,
                        parent_severity=parent_severity,
                        parent_category=parent_category,
                        is_test=is_test,
                        is_reporting=is_reporting,
                    )
                    stats["llm_failures"] += 1
                else:
                    # The LLM may emit null or non-string values; coerce them
                    # and fall back to the parent's severity/category.
                    atomic = AtomicControlCandidate(
                        title=_coerce_str(parsed.get("title"))[:200],
                        objective=_coerce_str(parsed.get("objective"))[:2000],
                        requirements=_ensure_list(parsed.get("requirements", [])),
                        test_procedure=_ensure_list(parsed.get("test_procedure", [])),
                        evidence=_ensure_list(parsed.get("evidence", [])),
                        severity=_normalize_severity(
                            parsed.get("severity") or parent_severity
                        ),
                        category=_coerce_str(parsed.get("category")) or parent_category,
                    )

                atomic.parent_control_uuid = parent_uuid
                atomic.obligation_candidate_id = candidate_id

                # Savepoint per candidate: the insert and the state update
                # succeed or fail together, and a failure does not affect
                # the rest of the batch.
                with self.db.begin_nested():
                    # Generate control_id from parent
                    seq = self._next_atomic_seq(parent_control_id)
                    atomic.candidate_id = f"{parent_control_id}-A{seq:02d}"

                    # Write to canonical_controls
                    self._write_atomic_control(atomic, parent_uuid, candidate_id)

                    # Mark obligation candidate as composed
                    self.db.execute(
                        text("""
                            UPDATE obligation_candidates
                            SET release_state = 'composed'
                            WHERE id = CAST(:oc_id AS uuid)
                        """),
                        {"oc_id": oc_id},
                    )

                stats["controls_created"] += 1
                stats["candidates_processed"] += 1

            except Exception as e:
                # Best-effort batch: record the failure (with traceback) and
                # continue with the next candidate.
                logger.exception("Pass 0b failed for %s: %s", candidate_id, e)
                stats["errors"] += 1

        self.db.commit()
        logger.info("Pass 0b: %s", stats)
        return stats

    # -------------------------------------------------------------------
    # Decomposition Status
    # -------------------------------------------------------------------

    def decomposition_status(self) -> dict:
        """Return decomposition progress counters and percentages.

        ``decomposition_pct`` is decomposed / rich controls and
        ``composition_pct`` is composed / validated candidates; both guard
        against a zero denominator.
        """
        row = self.db.execute(text("""
            SELECT
                (SELECT count(*) FROM canonical_controls
                 WHERE parent_control_uuid IS NULL
                   AND release_state NOT IN ('deprecated')) AS rich_controls,
                (SELECT count(DISTINCT parent_control_uuid)
                 FROM obligation_candidates) AS decomposed_controls,
                (SELECT count(*) FROM obligation_candidates) AS total_candidates,
                (SELECT count(*) FROM obligation_candidates
                 WHERE release_state = 'validated') AS validated,
                (SELECT count(*) FROM obligation_candidates
                 WHERE release_state = 'rejected') AS rejected,
                (SELECT count(*) FROM obligation_candidates
                 WHERE release_state = 'composed') AS composed,
                (SELECT count(*) FROM canonical_controls
                 WHERE parent_control_uuid IS NOT NULL) AS atomic_controls
        """)).fetchone()

        return {
            "rich_controls": row[0],
            "decomposed_controls": row[1],
            "total_candidates": row[2],
            "validated": row[3],
            "rejected": row[4],
            "composed": row[5],
            "atomic_controls": row[6],
            "decomposition_pct": round(row[1] / max(row[0], 1) * 100, 1),
            "composition_pct": round(row[5] / max(row[3], 1) * 100, 1),
        }

    # -------------------------------------------------------------------
    # DB Writers
    # -------------------------------------------------------------------

    def _write_obligation_candidate(self, cand: ObligationCandidate) -> None:
        """Insert an obligation candidate into the DB (no commit)."""
        self.db.execute(
            text("""
                INSERT INTO obligation_candidates (
                    parent_control_uuid, candidate_id,
                    obligation_text, action, object, condition,
                    normative_strength, is_test_obligation,
                    is_reporting_obligation, extraction_confidence,
                    quality_flags, release_state
                ) VALUES (
                    CAST(:parent_uuid AS uuid), :candidate_id,
                    :obligation_text, :action, :object, :condition,
                    :normative_strength, :is_test, :is_reporting,
                    :confidence, :quality_flags, :release_state
                )
            """),
            {
                "parent_uuid": cand.parent_control_uuid,
                "candidate_id": cand.candidate_id,
                "obligation_text": cand.obligation_text,
                "action": cand.action,
                "object": cand.object_,
                "condition": cand.condition,
                "normative_strength": cand.normative_strength,
                "is_test": cand.is_test_obligation,
                "is_reporting": cand.is_reporting_obligation,
                "confidence": cand.extraction_confidence,
                "quality_flags": json.dumps(cand.quality_flags),
                "release_state": cand.release_state,
            },
        )

    def _write_atomic_control(
        self, atomic: AtomicControlCandidate,
        parent_uuid: str, candidate_id: str,
    ) -> None:
        """Insert an atomic control into canonical_controls (no commit).

        The row is written with release_state ``draft`` and
        decomposition_method ``pass0b``; list fields are stored as JSON text.
        """
        self.db.execute(
            text("""
                INSERT INTO canonical_controls (
                    control_id, title, objective,
                    requirements, test_procedure, evidence,
                    severity, category, release_state,
                    parent_control_uuid, decomposition_method,
                    generation_metadata
                ) VALUES (
                    :control_id, :title, :objective,
                    :requirements, :test_procedure, :evidence,
                    :severity, :category, 'draft',
                    CAST(:parent_uuid AS uuid), 'pass0b',
                    :gen_meta
                )
            """),
            {
                "control_id": atomic.candidate_id,
                "title": atomic.title,
                "objective": atomic.objective,
                "requirements": json.dumps(atomic.requirements),
                "test_procedure": json.dumps(atomic.test_procedure),
                "evidence": json.dumps(atomic.evidence),
                "severity": atomic.severity,
                "category": atomic.category,
                "parent_uuid": parent_uuid,
                "gen_meta": json.dumps({
                    "decomposition_source": candidate_id,
                    "decomposition_method": "pass0b",
                }),
            },
        )

    def _next_atomic_seq(self, parent_control_id: str) -> int:
        """Get the next sequence number for atomic controls under a parent.

        Computed as the current number of child rows plus one.
        """
        result = self.db.execute(
            text("""
                SELECT count(*) FROM canonical_controls
                WHERE parent_control_uuid = (
                    SELECT id FROM canonical_controls
                    WHERE control_id = :parent_id
                    LIMIT 1
                )
            """),
            {"parent_id": parent_control_id},
        ).fetchone()
        return (result[0] if result else 0) + 1


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

def _format_field(value) -> str:
    """Format a requirements/test_procedure field for the LLM prompt.

    Lists (native or JSON-encoded in a string) become ``- item`` bullet
    lines; any other string is returned unchanged; other values use ``str()``.
    """
    if not value:
        return ""
    if isinstance(value, str):
        try:
            parsed = json.loads(value)
            if isinstance(parsed, list):
                return "\n".join(f"- {item}" for item in parsed)
            return value
        except (json.JSONDecodeError, TypeError):
            return value
    if isinstance(value, list):
        return "\n".join(f"- {item}" for item in value)
    return str(value)


def _format_citation(citation) -> str:
    """Format source_citation for display.

    Accepts a JSON string or an already-decoded dict and joins its ``source``,
    ``article`` and ``paragraph`` entries with spaces. Anything else (plain
    text, JSON without those keys) is returned as its string form.
    """
    if not citation:
        return ""

    parsed = citation
    if isinstance(citation, str):
        try:
            parsed = json.loads(citation)
        except (json.JSONDecodeError, TypeError):
            return citation

    if isinstance(parsed, dict):
        # str() so numeric paragraph/article values cannot break the join.
        parts = [
            str(parsed[key])
            for key in ("source", "article", "paragraph")
            if parsed.get(key)
        ]
        if parts:
            return " ".join(parts)

    return citation if isinstance(citation, str) else str(citation)


def _compute_extraction_confidence(flags: dict) -> float:
    """Compute a 0.0-1.0 confidence score as the weighted sum of passed flags."""
    weights = {
        "has_normative_signal": 0.30,
        "single_action": 0.20,
        "not_rationale": 0.20,
        "not_evidence_only": 0.15,
        "min_length": 0.10,
        "has_parent_link": 0.05,
    }
    score = sum(
        weight for flag, weight in weights.items() if flags.get(flag, False)
    )
    return round(score, 2)


def _normalize_severity(val: str) -> str:
    """Normalize a severity value to critical/high/medium/low.

    Unknown, empty or non-string values map to ``"medium"``.
    """
    if not isinstance(val, str):
        return "medium"
    val = (val or "medium").lower().strip()
    if val in ("critical", "high", "medium", "low"):
        return val
    return "medium"


def _template_fallback(
    obligation_text: str, action: str, object_: str,
    parent_title: str, parent_severity: str, parent_category: str,
    is_test: bool, is_reporting: bool,
) -> AtomicControlCandidate:
    """Create an atomic control candidate from template when LLM fails.

    Title, test procedure and evidence depend on whether the obligation is a
    test obligation, a reporting obligation or a regular one; the obligation
    text is reused as objective and sole requirement.
    """
    if is_test:
        title = f"Test: {object_[:60]}" if object_ else f"Test: {action[:60]}"
        test_proc = [f"Prüfung der {object_ or action}"]
        evidence = ["Testprotokoll", "Prüfbericht"]
    elif is_reporting:
        title = f"Meldepflicht: {object_[:60]}" if object_ else f"Meldung: {action[:60]}"
        test_proc = ["Prüfung des Meldeprozesses", "Stichprobe gemeldeter Vorfälle"]
        evidence = ["Meldeprozess-Dokumentation", "Meldeformulare"]
    else:
        title = f"{action.capitalize()}: {object_[:60]}" if object_ else parent_title[:80]
        test_proc = [f"Prüfung der {action}"]
        evidence = ["Dokumentation", "Konfigurationsnachweis"]

    return AtomicControlCandidate(
        title=title[:200],
        objective=obligation_text[:2000],
        requirements=[obligation_text] if obligation_text else [],
        test_procedure=test_proc,
        evidence=evidence,
        severity=_normalize_severity(parent_severity),
        category=parent_category,
    )