feat(decomposition): add merge pass, enrichment, and Pass 0b refinements
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 51s
CI/CD / test-python-backend-compliance (push) Successful in 34s
CI/CD / test-python-document-crawler (push) Successful in 23s
CI/CD / test-python-dsms-gateway (push) Successful in 20s
CI/CD / validate-canonical-controls (push) Successful in 12s
CI/CD / Deploy (push) Has been skipped

Add obligation refinement pipeline between Pass 0a and 0b:
- Merge pass: rule-based dedup of implementation-level duplicate obligations
  within the same parent control (Jaccard similarity on action+object)
- Enrich pass: classify trigger_type (event/periodic/continuous) and detect
  is_implementation_specific from obligation text (regex-based, no LLM)
- Pass 0b: skip merged obligations, cap severity for impl-specific, override
  category to 'testing' for test obligations
- Migration 075: merged_into_id, trigger_type, is_implementation_specific
- Two new API endpoints: merge-obligations, enrich-obligations
- 30+ new tests (122 total, all passing)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-21 22:27:09 +01:00
parent 71b8c33270
commit a14e2f3a00
4 changed files with 804 additions and 12 deletions

View File

@@ -126,6 +126,83 @@ _REPORTING_SIGNALS = [
_REPORTING_RE = re.compile("|".join(_REPORTING_SIGNALS), re.IGNORECASE)
# ---------------------------------------------------------------------------
# Merge & Enrichment helpers
# ---------------------------------------------------------------------------
# Trigger-type detection patterns
_EVENT_TRIGGERS = re.compile(
r"\b(vorfall|incident|breach|verletzung|sicherheitsvorfall|meldung|entdeckung"
r"|feststellung|erkennung|ereignis|eintritt|bei\s+auftreten|im\s+falle"
r"|wenn\s+ein|sobald|unverzüglich|upon|in\s+case\s+of|when\s+a)\b",
re.IGNORECASE,
)
_PERIODIC_TRIGGERS = re.compile(
r"\b(jährlich|monatlich|quartalsweise|regelmäßig|periodisch|annually"
r"|monthly|quarterly|periodic|mindestens\s+(einmal|alle)|turnusmäßig"
r"|wiederkehrend|in\s+regelmäßigen\s+abständen)\b",
re.IGNORECASE,
)
# Implementation-specific keywords (concrete tools/protocols/formats)
_IMPL_SPECIFIC_PATTERNS = re.compile(
r"\b(TLS|SSL|AES|RSA|SHA-\d|HTTPS|LDAP|SAML|OAuth|OIDC|MFA|2FA"
r"|SIEM|IDS|IPS|WAF|VPN|VLAN|DMZ|HSM|PKI|RBAC|ABAC"
r"|ISO\s*27\d{3}|SOC\s*2|PCI[\s-]DSS|NIST"
r"|Firewall|Antivirus|EDR|XDR|SOAR|DLP"
r"|SMS|E-Mail|Fax|Telefon"
r"|JSON|XML|CSV|PDF|YAML"
r"|PostgreSQL|MySQL|MongoDB|Redis|Kafka"
r"|Docker|Kubernetes|AWS|Azure|GCP"
r"|Active\s*Directory|RADIUS|Kerberos"
r"|RSyslog|Splunk|ELK|Grafana|Prometheus"
r"|Git|Jenkins|Terraform|Ansible)\b",
re.IGNORECASE,
)
def _classify_trigger_type(obligation_text: str, condition: str) -> str:
"""Classify when an obligation is triggered: event/periodic/continuous."""
combined = f"{obligation_text} {condition}"
if _EVENT_TRIGGERS.search(combined):
return "event"
if _PERIODIC_TRIGGERS.search(combined):
return "periodic"
return "continuous"
def _is_implementation_specific_text(
obligation_text: str, action: str, obj: str
) -> bool:
"""Check if an obligation references concrete implementation details."""
combined = f"{obligation_text} {action} {obj}"
matches = _IMPL_SPECIFIC_PATTERNS.findall(combined)
return len(matches) >= 1
def _text_similar(a: str, b: str, threshold: float = 0.75) -> bool:
"""Quick token-overlap similarity check (Jaccard on words)."""
if not a or not b:
return False
tokens_a = set(a.split())
tokens_b = set(b.split())
if not tokens_a or not tokens_b:
return False
intersection = tokens_a & tokens_b
union = tokens_a | tokens_b
return len(intersection) / len(union) >= threshold
def _is_more_implementation_specific(text_a: str, text_b: str) -> bool:
"""Return True if text_a is more implementation-specific than text_b."""
matches_a = len(_IMPL_SPECIFIC_PATTERNS.findall(text_a))
matches_b = len(_IMPL_SPECIFIC_PATTERNS.findall(text_b))
if matches_a != matches_b:
return matches_a > matches_b
# Tie-break: longer text is usually more specific
return len(text_a) > len(text_b)
# ---------------------------------------------------------------------------
# Data classes
# ---------------------------------------------------------------------------
@@ -864,12 +941,17 @@ class DecompositionPass:
)
stats["controls_processed"] += 1
# Commit after each successful sub-batch to avoid losing work
self.db.commit()
except Exception as e:
ids = ", ".join(c["control_id"] for c in batch)
logger.error("Pass 0a failed for [%s]: %s", ids, e)
stats["errors"] += 1
self.db.commit()
try:
self.db.rollback()
except Exception:
pass
logger.info("Pass 0a: %s", stats)
return stats
@@ -944,10 +1026,13 @@ class DecompositionPass:
cc.category AS parent_category,
cc.source_citation AS parent_citation,
cc.severity AS parent_severity,
cc.control_id AS parent_control_id
cc.control_id AS parent_control_id,
oc.trigger_type,
oc.is_implementation_specific
FROM obligation_candidates oc
JOIN canonical_controls cc ON cc.id = oc.parent_control_uuid
WHERE oc.release_state = 'validated'
AND oc.merged_into_id IS NULL
AND NOT EXISTS (
SELECT 1 FROM canonical_controls ac
WHERE ac.parent_control_uuid = oc.parent_control_uuid
@@ -971,6 +1056,7 @@ class DecompositionPass:
"dedup_enabled": self._dedup is not None,
"dedup_linked": 0,
"dedup_review": 0,
"skipped_merged": 0,
}
# Prepare obligation data
@@ -991,6 +1077,8 @@ class DecompositionPass:
"parent_severity": row[11] or "medium",
"parent_control_id": row[12] or "",
"source_ref": _format_citation(row[10] or ""),
"trigger_type": row[13] or "continuous",
"is_implementation_specific": row[14] or False,
})
# Process in batches
@@ -1044,12 +1132,17 @@ class DecompositionPass:
parsed = _parse_json_object(llm_response)
await self._process_pass0b_control(obl, parsed, stats)
# Commit after each successful sub-batch
self.db.commit()
except Exception as e:
ids = ", ".join(o["candidate_id"] for o in batch)
logger.error("Pass 0b failed for [%s]: %s", ids, e)
stats["errors"] += 1
self.db.commit()
try:
self.db.rollback()
except Exception:
pass
logger.info("Pass 0b: %s", stats)
return stats
@@ -1090,6 +1183,16 @@ class DecompositionPass:
atomic.parent_control_uuid = obl["parent_uuid"]
atomic.obligation_candidate_id = obl["candidate_id"]
# Cap severity for implementation-specific obligations
if obl.get("is_implementation_specific") and atomic.severity in (
"critical", "high"
):
atomic.severity = "medium"
# Override category for test obligations
if obl.get("is_test"):
atomic.category = "testing"
# ── Dedup check (if enabled) ────────────────────────────
if self._dedup:
pattern_id = None
@@ -1182,6 +1285,150 @@ class DecompositionPass:
stats["controls_created"] += 1
stats["candidates_processed"] += 1
# -------------------------------------------------------------------
# Merge Pass: Deduplicate implementation-level obligations
# -------------------------------------------------------------------
def run_merge_pass(self) -> dict:
    """Merge implementation-level duplicate obligations within each parent.

    When the same parent control has multiple obligations with nearly
    identical action+object (e.g. "SMS-Verbot" + "Policy-as-Code" both
    implementing a communication restriction), keep the more abstract one
    and mark the concrete one as merged.

    No LLM calls — purely rule-based using text similarity.

    Returns:
        Stats dict: parents_checked, obligations_merged, obligations_kept.
    """
    stats = {
        "parents_checked": 0,
        "obligations_merged": 0,
        "obligations_kept": 0,
    }

    # Get all parents that have >1 validated obligation
    parents = self.db.execute(text("""
        SELECT parent_control_uuid, count(*) AS cnt
        FROM obligation_candidates
        WHERE release_state = 'validated'
          AND merged_into_id IS NULL
        GROUP BY parent_control_uuid
        HAVING count(*) > 1
    """)).fetchall()

    for parent_uuid, _count in parents:
        stats["parents_checked"] += 1
        obligs = self.db.execute(text("""
            SELECT id, candidate_id, obligation_text, action, object
            FROM obligation_candidates
            WHERE parent_control_uuid = CAST(:pid AS uuid)
              AND release_state = 'validated'
              AND merged_into_id IS NULL
            ORDER BY created_at
        """), {"pid": str(parent_uuid)}).fetchall()

        merged_ids = set()
        oblig_list = list(obligs)
        for i in range(len(oblig_list)):
            if str(oblig_list[i][0]) in merged_ids:
                continue
            for j in range(i + 1, len(oblig_list)):
                if str(oblig_list[j][0]) in merged_ids:
                    continue
                action_i = (oblig_list[i][3] or "").lower().strip()
                action_j = (oblig_list[j][3] or "").lower().strip()
                obj_i = (oblig_list[i][4] or "").lower().strip()
                obj_j = (oblig_list[j][4] or "").lower().strip()
                # Both action and object must be similar enough to count
                # as duplicates; object uses a looser threshold.
                if not _text_similar(action_i, action_j, threshold=0.75):
                    continue
                if not _text_similar(obj_i, obj_j, threshold=0.60):
                    continue
                # Keep the more abstract one (fewer implementation
                # keywords; shorter text on a tie) and merge the other.
                text_i = oblig_list[i][2] or ""
                text_j = oblig_list[j][2] or ""
                if _is_more_implementation_specific(text_j, text_i):
                    survivor_id = str(oblig_list[i][0])
                    merged_id = str(oblig_list[j][0])
                else:
                    survivor_id = str(oblig_list[j][0])
                    merged_id = str(oblig_list[i][0])
                self.db.execute(text("""
                    UPDATE obligation_candidates
                    SET release_state = 'merged',
                        merged_into_id = CAST(:survivor AS uuid)
                    WHERE id = CAST(:merged AS uuid)
                """), {"survivor": survivor_id, "merged": merged_id})
                merged_ids.add(merged_id)
                stats["obligations_merged"] += 1
                # BUGFIX: if the outer obligation i was itself absorbed,
                # it must not continue pairing — previously the inner loop
                # kept comparing against the now-merged i, which could
                # re-merge it (overwriting merged_into_id, double-counting)
                # or merge later rows into an already-merged obligation.
                if merged_id == str(oblig_list[i][0]):
                    break
        # Commit per parent to avoid large transactions
        self.db.commit()

    stats["obligations_kept"] = self.db.execute(text("""
        SELECT count(*) FROM obligation_candidates
        WHERE release_state = 'validated' AND merged_into_id IS NULL
    """)).fetchone()[0]
    logger.info("Merge pass: %s", stats)
    return stats
# -------------------------------------------------------------------
# Enrich Pass: Add metadata to obligations
# -------------------------------------------------------------------
def enrich_obligations(self) -> dict:
    """Add trigger_type and is_implementation_specific to obligations.

    Rule-based enrichment — no LLM calls. Only touches validated,
    unmerged obligations whose trigger_type has not been set yet.

    Returns:
        Stats dict: enriched, trigger_event/periodic/continuous,
        implementation_specific.
    """
    stats = {
        "enriched": 0,
        "trigger_event": 0,
        "trigger_periodic": 0,
        "trigger_continuous": 0,
        "implementation_specific": 0,
    }
    pending = self.db.execute(text("""
        SELECT id, obligation_text, condition, action, object
        FROM obligation_candidates
        WHERE release_state = 'validated'
          AND merged_into_id IS NULL
          AND trigger_type IS NULL
    """)).fetchall()

    for candidate_uuid, raw_text, raw_cond, raw_action, raw_obj in pending:
        # Normalize NULL columns to empty strings before classification.
        obl_text = raw_text or ""
        condition = raw_cond or ""
        action = raw_action or ""
        obj = raw_obj or ""

        trigger = _classify_trigger_type(obl_text, condition)
        impl_flag = _is_implementation_specific_text(obl_text, action, obj)

        self.db.execute(text("""
            UPDATE obligation_candidates
            SET trigger_type = :trigger,
                is_implementation_specific = :impl
            WHERE id = CAST(:oid AS uuid)
        """), {"trigger": trigger, "impl": impl_flag, "oid": str(candidate_uuid)})

        stats["enriched"] += 1
        stats["trigger_" + trigger] += 1
        if impl_flag:
            stats["implementation_specific"] += 1

    self.db.commit()
    logger.info("Enrich pass: %s", stats)
    return stats
# -------------------------------------------------------------------
# Decomposition Status
# -------------------------------------------------------------------
@@ -1198,9 +1445,13 @@ class DecompositionPass:
(SELECT count(*) FROM obligation_candidates WHERE release_state = 'validated') AS validated,
(SELECT count(*) FROM obligation_candidates WHERE release_state = 'rejected') AS rejected,
(SELECT count(*) FROM obligation_candidates WHERE release_state = 'composed') AS composed,
(SELECT count(*) FROM canonical_controls WHERE parent_control_uuid IS NOT NULL) AS atomic_controls
(SELECT count(*) FROM canonical_controls WHERE parent_control_uuid IS NOT NULL) AS atomic_controls,
(SELECT count(*) FROM obligation_candidates WHERE release_state = 'merged') AS merged,
(SELECT count(*) FROM obligation_candidates WHERE trigger_type IS NOT NULL) AS enriched
""")).fetchone()
validated_for_0b = row[3] - (row[7] or 0) # validated minus merged
return {
"rich_controls": row[0],
"decomposed_controls": row[1],
@@ -1209,8 +1460,11 @@ class DecompositionPass:
"rejected": row[4],
"composed": row[5],
"atomic_controls": row[6],
"merged": row[7] or 0,
"enriched": row[8] or 0,
"ready_for_pass0b": validated_for_0b,
"decomposition_pct": round(row[1] / max(row[0], 1) * 100, 1),
"composition_pct": round(row[5] / max(row[3], 1) * 100, 1),
"composition_pct": round(row[5] / max(validated_for_0b, 1) * 100, 1),
}
# -------------------------------------------------------------------