feat(rag): optimize RAG pipeline — JSON-Mode, CoT, Hybrid Search, Re-Ranking, Cross-Reg Dedup, chunk 1024

Phase 1 (LLM Quality):
- Add format=json to all Ollama payloads (obligation_extractor, control_generator, citation_backfill)
- Add Chain-of-Thought analysis steps to Pass 0a/0b system prompts

Phase 2 (Retrieval Quality):
- Hybrid search via Qdrant Query API with RRF fusion + automatic text index (legal_rag.go)
- Fallback to dense-only search if Query API unavailable
- Cross-encoder re-ranking with BGE Reranker v2 (RERANK_ENABLED=false by default)
- CPU-only PyTorch dependency to keep Docker image small

Phase 3 (Data Layer):
- Cross-regulation dedup pass (threshold 0.95) links controls across regulations
- DedupResult.link_type field distinguishes dedup_merge vs cross_regulation
- Chunk size defaults updated 512/50 → 1024/128 for new ingestions only
- Existing collections and controls are NOT affected

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-21 11:49:43 +01:00
parent c3a53fe5d2
commit c52dbdb8f1
24 changed files with 2620 additions and 139 deletions
@@ -69,7 +69,7 @@ class AnchorFinder:
        tags_str = " ".join(control.tags[:3]) if control.tags else ""
        query = f"{control.title} {tags_str}".strip()

-        results = await self.rag.search(
+        results = await self.rag.search_with_rerank(
            query=query,
            collection="bp_compliance_ce",
            top_k=15,
@@ -391,6 +391,7 @@ async def _llm_ollama(prompt: str, system_prompt: Optional[str] = None) -> str:
        "model": OLLAMA_MODEL,
        "messages": messages,
        "stream": False,
+        "format": "json",
        "options": {"num_predict": 256},
        "think": False,
    }
@@ -0,0 +1,733 @@
+"""Control Deduplication Engine — 4-Stage Matching Pipeline.
+
+Prevents duplicate atomic controls during Pass 0b by checking candidates
+against existing controls before insertion.
+
+Stages:
+    1. Pattern-Gate:  pattern_id must match (hard gate)
+    2. Action-Check:  normalized action verb must match (hard gate)
+    3. Object-Norm:   normalized object must match (soft gate with high threshold)
+    4. Embedding:     cosine similarity with tiered thresholds (Qdrant)
+
+Verdicts:
+    - NEW:    create a new atomic control
+    - LINK:   add parent link to existing control (similarity > LINK_THRESHOLD)
+    - REVIEW: queue for human review (REVIEW_THRESHOLD < sim < LINK_THRESHOLD)
+"""
+
+import logging
+import os
+import re
+from dataclasses import dataclass, field
+from typing import Optional, Callable, Awaitable
+
+import httpx
+
+logger = logging.getLogger(__name__)
+
+# ── Configuration ────────────────────────────────────────────────────
+
+# Feature flag: only the string "true" (case-insensitive) enables dedup.
+DEDUP_ENABLED = os.getenv("DEDUP_ENABLED", "true").lower() == "true"
+# Same action + same object: a best score above this yields verdict "link".
+LINK_THRESHOLD = float(os.getenv("DEDUP_LINK_THRESHOLD", "0.92"))
+# Same action + same object: a best score above this (but not above
+# LINK_THRESHOLD) yields verdict "review".
+REVIEW_THRESHOLD = float(os.getenv("DEDUP_REVIEW_THRESHOLD", "0.85"))
+# Stricter link threshold used when the normalized objects differ.
+LINK_THRESHOLD_DIFF_OBJECT = float(os.getenv("DEDUP_LINK_THRESHOLD_DIFF_OBJ", "0.95"))
+# Link threshold for the cross-regulation pass (search without pattern_id filter).
+CROSS_REG_LINK_THRESHOLD = float(os.getenv("DEDUP_CROSS_REG_THRESHOLD", "0.95"))
+# Name of the Qdrant collection that stores the indexed atomic controls.
+QDRANT_COLLECTION = os.getenv("DEDUP_QDRANT_COLLECTION", "atomic_controls")
+# Base URL of the Qdrant REST API.
+QDRANT_URL = os.getenv("QDRANT_URL", "http://host.docker.internal:6333")
+# Base URL of the embedding service (its /embed endpoint is called).
+EMBEDDING_URL = os.getenv("EMBEDDING_URL", "http://embedding-service:8087")
+
+
+# ── Result Dataclass ─────────────────────────────────────────────────
+
+@dataclass
+class DedupResult:
+    """Outcome of the dedup check.
+
+    Returned by ``ControlDedupChecker.check_duplicate``. The ``matched_*``
+    fields are taken from the Qdrant payload of the best-scoring hit when
+    one was evaluated; otherwise they stay ``None``.
+    """
+    verdict: str  # "new" | "link" | "review"
+    # UUID of the matched existing control (payload key "control_uuid").
+    matched_control_uuid: Optional[str] = None
+    # Human-readable ID of the matched control (payload key "control_id").
+    matched_control_id: Optional[str] = None
+    # Title of the matched control (payload key "title").
+    matched_title: Optional[str] = None
+    stage: str = ""  # which stage decided
+    # Similarity score reported by Qdrant for the best hit (0.0 if none).
+    similarity_score: float = 0.0
+    link_type: str = "dedup_merge"  # "dedup_merge" | "cross_regulation"
+    # Free-form diagnostics (normalized action/object, thresholds, ...).
+    details: dict = field(default_factory=dict)
+
+
+# ── Action Normalization ─────────────────────────────────────────────
+
+_ACTION_SYNONYMS: dict[str, str] = {
+    # German → canonical English
+    "implementieren": "implement",
+    "umsetzen": "implement",
+    "einrichten": "implement",
+    "einführen": "implement",
+    "aufbauen": "implement",
+    "bereitstellen": "implement",
+    "aktivieren": "implement",
+    "konfigurieren": "configure",
+    "einstellen": "configure",
+    "parametrieren": "configure",
+    "testen": "test",
+    "prüfen": "test",
+    "überprüfen": "test",
+    "verifizieren": "test",
+    "validieren": "test",
+    "kontrollieren": "test",
+    "auditieren": "audit",
+    "dokumentieren": "document",
+    "protokollieren": "log",
+    "aufzeichnen": "log",
+    "loggen": "log",
+    "überwachen": "monitor",
+    "monitoring": "monitor",
+    "beobachten": "monitor",
+    "schulen": "train",
+    "trainieren": "train",
+    "sensibilisieren": "train",
+    "löschen": "delete",
+    "entfernen": "delete",
+    "verschlüsseln": "encrypt",
+    "sperren": "block",
+    "beschränken": "restrict",
+    "einschränken": "restrict",
+    "begrenzen": "restrict",
+    "autorisieren": "authorize",
+    "genehmigen": "authorize",
+    "freigeben": "authorize",
+    "authentifizieren": "authenticate",
+    "identifizieren": "identify",
+    "melden": "report",
+    "benachrichtigen": "notify",
+    "informieren": "notify",
+    "aktualisieren": "update",
+    "erneuern": "update",
+    "sichern": "backup",
+    "wiederherstellen": "restore",
+    # English passthrough
+    "implement": "implement",
+    "configure": "configure",
+    "test": "test",
+    "verify": "test",
+    "validate": "test",
+    "audit": "audit",
+    "document": "document",
+    "log": "log",
+    "monitor": "monitor",
+    "train": "train",
+    "delete": "delete",
+    "encrypt": "encrypt",
+    "restrict": "restrict",
+    "authorize": "authorize",
+    "authenticate": "authenticate",
+    "report": "report",
+    "update": "update",
+    "backup": "backup",
+    "restore": "restore",
+}
+
+
+def normalize_action(action: str) -> str:
+    """Normalize an action verb to a canonical English form.
+
+    Lookup order: exact synonym match, match after stripping a German
+    conjugation suffix, then a prefix-based fuzzy match against the
+    synonym table. Unknown verbs are returned stripped and lowercased.
+
+    Args:
+        action: Raw action verb (German or English); may be empty.
+
+    Returns:
+        The canonical verb, the cleaned input when no synonym is known,
+        or "" for empty / whitespace-only input.
+    """
+    if not action:
+        return ""
+    action = action.strip().lower()
+    if not action:
+        # Whitespace-only input: every verb starts with "", so the fuzzy
+        # match below would otherwise return an arbitrary canonical verb.
+        return ""
+    # Try exact match first, then base form
+    if action in _ACTION_SYNONYMS:
+        return _ACTION_SYNONYMS[action]
+    # Strip German infinitive/conjugation suffixes for lookup
+    action_base = re.sub(r"(en|t|st|e|te|tet|end)$", "", action)
+    if action_base in _ACTION_SYNONYMS:
+        return _ACTION_SYNONYMS[action_base]
+    # Fuzzy: the input extends a known verb, or abbreviates one. One- and
+    # two-letter inputs are a prefix of many unrelated verbs, so they are
+    # not accepted as abbreviations.
+    is_plausible_abbreviation = len(action) >= 3
+    for verb, canonical in _ACTION_SYNONYMS.items():
+        if action.startswith(verb) or (
+            is_plausible_abbreviation and verb.startswith(action)
+        ):
+            return canonical
+    return action  # fallback: return as-is
+
+
+# ── Object Normalization ─────────────────────────────────────────────
+
+_OBJECT_SYNONYMS: dict[str, str] = {
+    # Authentication / Access
+    "mfa": "multi_factor_auth",
+    "multi-faktor-authentifizierung": "multi_factor_auth",
+    "mehrfaktorauthentifizierung": "multi_factor_auth",
+    "multi-factor authentication": "multi_factor_auth",
+    "two-factor": "multi_factor_auth",
+    "2fa": "multi_factor_auth",
+    "passwort": "password_policy",
+    "kennwort": "password_policy",
+    "password": "password_policy",
+    "zugangsdaten": "credentials",
+    "credentials": "credentials",
+    "admin-konten": "privileged_access",
+    "admin accounts": "privileged_access",
+    "administratorkonten": "privileged_access",
+    "privilegierte zugriffe": "privileged_access",
+    "privileged accounts": "privileged_access",
+    "remote-zugriff": "remote_access",
+    "fernzugriff": "remote_access",
+    "remote access": "remote_access",
+    "session": "session_management",
+    "sitzung": "session_management",
+    "sitzungsverwaltung": "session_management",
+    # Encryption
+    "verschlüsselung": "encryption",
+    "encryption": "encryption",
+    "kryptografie": "encryption",
+    "kryptografische verfahren": "encryption",
+    "schlüssel": "key_management",
+    "key management": "key_management",
+    "schlüsselverwaltung": "key_management",
+    "zertifikat": "certificate_management",
+    "certificate": "certificate_management",
+    "tls": "transport_encryption",
+    "ssl": "transport_encryption",
+    "https": "transport_encryption",
+    # Network
+    "firewall": "firewall",
+    "netzwerk": "network_security",
+    "network": "network_security",
+    "vpn": "vpn",
+    "segmentierung": "network_segmentation",
+    "segmentation": "network_segmentation",
+    # Logging / Monitoring
+    "audit-log": "audit_logging",
+    "audit log": "audit_logging",
+    "protokoll": "audit_logging",
+    "logging": "audit_logging",
+    "monitoring": "monitoring",
+    "überwachung": "monitoring",
+    "alerting": "alerting",
+    "alarmierung": "alerting",
+    "siem": "siem",
+    # Data
+    "personenbezogene daten": "personal_data",
+    "personal data": "personal_data",
+    "sensible daten": "sensitive_data",
+    "sensitive data": "sensitive_data",
+    "datensicherung": "backup",
+    "backup": "backup",
+    "wiederherstellung": "disaster_recovery",
+    "disaster recovery": "disaster_recovery",
+    # Policy / Process
+    "richtlinie": "policy",
+    "policy": "policy",
+    "verfahrensanweisung": "procedure",
+    "procedure": "procedure",
+    "prozess": "process",
+    "schulung": "training",
+    "training": "training",
+    "awareness": "awareness",
+    "sensibilisierung": "awareness",
+    # Incident
+    "vorfall": "incident",
+    "incident": "incident",
+    "sicherheitsvorfall": "security_incident",
+    "security incident": "security_incident",
+    # Vulnerability
+    "schwachstelle": "vulnerability",
+    "vulnerability": "vulnerability",
+    "patch": "patch_management",
+    "update": "patch_management",
+    "patching": "patch_management",
+}
+
+# Keys sorted longest-first so the most specific phrase wins during substring matching
+_OBJECT_KEYS_SORTED = sorted(_OBJECT_SYNONYMS.keys(), key=len, reverse=True)
+
+
+def normalize_object(obj: str) -> str:
+    """Map a compliance object phrase onto a canonical token.
+
+    Resolution order: exact synonym hit, first known phrase (longest
+    first) contained in the input, then a generic fallback that drops
+    German/English articles and prepositions and joins up to four
+    remaining words (longer than two characters) with underscores.
+    """
+    if not obj:
+        return ""
+    needle = obj.strip().lower()
+
+    # 1) Exact match
+    exact = _OBJECT_SYNONYMS.get(needle)
+    if exact is not None:
+        return exact
+
+    # 2) Substring match, most specific (longest) phrase first
+    contained = next((key for key in _OBJECT_KEYS_SORTED if key in needle), None)
+    if contained is not None:
+        return _OBJECT_SYNONYMS[contained]
+
+    # 3) Fallback: strip filler words, keep the first few meaningful words
+    filler_pattern = (
+        r"\b(der|die|das|den|dem|des|ein|eine|eines|einem|einen"
+        r"|für|von|zu|auf|in|an|bei|mit|nach|über|unter|the|a|an"
+        r"|for|of|to|on|in|at|by|with)\b"
+    )
+    words = [w for w in re.sub(filler_pattern, "", needle).split() if len(w) > 2]
+    if words:
+        return "_".join(words[:4])
+    return needle.replace(" ", "_")
+
+
+# ── Canonicalization ─────────────────────────────────────────────────
+
+def canonicalize_text(action: str, obj: str, title: str = "") -> str:
+    """Assemble the canonical text that is sent to the embedding service.
+
+    The text is "<normalized action> <normalized object>", optionally
+    followed by "for" and up to five title words (lowercased, German
+    filler words removed, only words longer than three characters).
+    """
+    words = [normalize_action(action), normalize_object(obj)]
+    if not title:
+        return " ".join(words)
+
+    # Drop common filler words from the title before picking keywords
+    stripped_title = re.sub(
+        r"\b(und|oder|für|von|zu|der|die|das|den|dem|des|ein|eine"
+        r"|bei|mit|nach|gemäß|gem\.|laut|entsprechend)\b",
+        "",
+        title.lower(),
+    )
+    keywords = [word for word in stripped_title.split() if len(word) > 3][:5]
+    if keywords:
+        words += ["for", *keywords]
+    return " ".join(words)
+
+
+# ── Embedding Helper ─────────────────────────────────────────────────
+
+async def get_embedding(text: str) -> list[float]:
+    """Get embedding vector for a single text via embedding service.
+
+    Posts ``{"texts": [text]}`` to ``{EMBEDDING_URL}/embed`` and returns
+    the first entry of the response's "embeddings" list.
+
+    Args:
+        text: The text to embed.
+
+    Returns:
+        The embedding vector, or an empty list when the request fails,
+        the service answers with an error status, or no embedding is
+        returned. Failures are logged as warnings, never raised.
+    """
+    try:
+        async with httpx.AsyncClient(timeout=10.0) as client:
+            resp = await client.post(
+                f"{EMBEDDING_URL}/embed",
+                json={"texts": [text]},
+            )
+            # Surface HTTP errors in the log instead of silently treating
+            # an error body without "embeddings" as "no embedding".
+            resp.raise_for_status()
+            embeddings = resp.json().get("embeddings", [])
+            return embeddings[0] if embeddings else []
+    except Exception as e:
+        # Best effort: callers treat an empty vector as "embedding unavailable".
+        logger.warning("Embedding failed: %s", e)
+        return []
+
+
+def cosine_similarity(a: list[float], b: list[float]) -> float:
+    """Return the cosine similarity of two equally sized vectors.
+
+    Yields 0.0 when either vector is empty, the lengths differ, or one of
+    the vectors has zero magnitude.
+    """
+    if not (a and b) or len(a) != len(b):
+        return 0.0
+    magnitude_a = sum(v * v for v in a) ** 0.5
+    magnitude_b = sum(v * v for v in b) ** 0.5
+    if magnitude_a == 0 or magnitude_b == 0:
+        return 0.0
+    inner = sum(p * q for p, q in zip(a, b))
+    return inner / (magnitude_a * magnitude_b)
+
+
+# ── Qdrant Helpers ───────────────────────────────────────────────────
+
+async def qdrant_search(
+    embedding: list[float],
+    pattern_id: str,
+    top_k: int = 10,
+) -> list[dict]:
+    """Find the nearest atomic controls in Qdrant that share ``pattern_id``.
+
+    Returns the "result" list of the Qdrant response, or an empty list
+    when the embedding is empty, the response status is not 200, or the
+    request raises. Errors are logged as warnings.
+    """
+    if not embedding:
+        return []
+    same_pattern_only = {
+        "must": [
+            {"key": "pattern_id", "match": {"value": pattern_id}}
+        ]
+    }
+    request_body: dict = {
+        "vector": embedding,
+        "limit": top_k,
+        "with_payload": True,
+        "filter": same_pattern_only,
+    }
+    endpoint = f"{QDRANT_URL}/collections/{QDRANT_COLLECTION}/points/search"
+    try:
+        async with httpx.AsyncClient(timeout=10.0) as client:
+            resp = await client.post(endpoint, json=request_body)
+            if resp.status_code == 200:
+                return resp.json().get("result", [])
+            logger.warning("Qdrant search failed: %d", resp.status_code)
+            return []
+    except Exception as e:
+        logger.warning("Qdrant search error: %s", e)
+        return []
+
+
+async def qdrant_search_cross_regulation(
+    embedding: list[float],
+    top_k: int = 5,
+) -> list[dict]:
+    """Find the nearest controls in Qdrant without any pattern_id filter.
+
+    Used for cross-regulation linking (e.g. DSGVO Art. 25 ↔ NIS2 Art. 21).
+    Returns the "result" list of the Qdrant response, or an empty list
+    when the embedding is empty, the response status is not 200, or the
+    request raises. Errors are logged as warnings.
+    """
+    if not embedding:
+        return []
+    request_body: dict = {
+        "vector": embedding,
+        "limit": top_k,
+        "with_payload": True,
+    }
+    endpoint = f"{QDRANT_URL}/collections/{QDRANT_COLLECTION}/points/search"
+    try:
+        async with httpx.AsyncClient(timeout=10.0) as client:
+            resp = await client.post(endpoint, json=request_body)
+            if resp.status_code == 200:
+                return resp.json().get("result", [])
+            logger.warning("Qdrant cross-reg search failed: %d", resp.status_code)
+            return []
+    except Exception as e:
+        logger.warning("Qdrant cross-reg search error: %s", e)
+        return []
+
+
+async def qdrant_upsert(
+    point_id: str,
+    embedding: list[float],
+    payload: dict,
+) -> bool:
+    """Upsert a single point into the atomic_controls Qdrant collection.
+
+    Args:
+        point_id: ID of the point to write.
+        embedding: Vector to store; an empty vector is rejected.
+        payload: Payload stored alongside the vector.
+
+    Returns:
+        True when Qdrant answers with status 200, False for an empty
+        embedding, any other status, or a request error. Failures are
+        logged as warnings, never raised.
+    """
+    if not embedding:
+        return False
+    body = {
+        "points": [{
+            "id": point_id,
+            "vector": embedding,
+            "payload": payload,
+        }]
+    }
+    try:
+        async with httpx.AsyncClient(timeout=10.0) as client:
+            resp = await client.put(
+                f"{QDRANT_URL}/collections/{QDRANT_COLLECTION}/points",
+                json=body,
+            )
+            if resp.status_code != 200:
+                # Log rejected writes like the search helpers do, so a
+                # control missing from the index can be diagnosed.
+                logger.warning("Qdrant upsert failed: %d", resp.status_code)
+                return False
+            return True
+    except Exception as e:
+        logger.warning("Qdrant upsert error: %s", e)
+        return False
+
+
+async def ensure_qdrant_collection(vector_size: int = 1024) -> bool:
+    """Create the Qdrant collection if it doesn't exist (idempotent).
+
+    After creating the collection, keyword payload indexes are requested
+    for pattern_id, action_normalized, object_normalized and control_id.
+
+    Args:
+        vector_size: Dimension of the vectors stored in the collection.
+
+    Returns:
+        True when the collection already exists or was created; False when
+        creation is rejected or any request raises. A rejected index
+        request is logged but does not change the return value.
+    """
+    collection_url = f"{QDRANT_URL}/collections/{QDRANT_COLLECTION}"
+    try:
+        async with httpx.AsyncClient(timeout=10.0) as client:
+            # Check if exists
+            resp = await client.get(collection_url)
+            if resp.status_code == 200:
+                return True
+            # Create
+            resp = await client.put(
+                collection_url,
+                json={
+                    "vectors": {"size": vector_size, "distance": "Cosine"},
+                },
+            )
+            if resp.status_code != 200:
+                logger.error("Failed to create Qdrant collection: %d", resp.status_code)
+                return False
+            logger.info("Created Qdrant collection: %s", QDRANT_COLLECTION)
+            # Create payload indexes
+            for field_name in ["pattern_id", "action_normalized", "object_normalized", "control_id"]:
+                index_resp = await client.put(
+                    f"{collection_url}/index",
+                    json={"field_name": field_name, "field_schema": "keyword"},
+                )
+                if index_resp.status_code != 200:
+                    # The collection is usable without the index, so only warn.
+                    logger.warning(
+                        "Failed to create Qdrant payload index %s: %d",
+                        field_name, index_resp.status_code,
+                    )
+            return True
+    except Exception as e:
+        logger.warning("Qdrant collection check error: %s", e)
+        return False
+
+
+# ── Main Dedup Checker ───────────────────────────────────────────────
+
+class ControlDedupChecker:
+    """4-stage dedup checker for atomic controls.
+
+    Usage:
+        checker = ControlDedupChecker(db_session)
+        result = await checker.check_duplicate(action, obj, title, pattern_id)
+        if result.verdict == "link":
+            checker.add_parent_link(
+                result.matched_control_uuid, parent_uuid,
+                link_type=result.link_type,
+                confidence=result.similarity_score,
+            )
+        elif result.verdict == "review":
+            checker.write_review(control_id, title, objective, result)
+        else:
+            # Insert the new control, then index it for future checks
+            await checker.index_control(
+                control_uuid, control_id, title, action, obj, pattern_id)
+    """
+
+    def __init__(
+        self,
+        db,
+        embed_fn: Optional[Callable[[str], Awaitable[list[float]]]] = None,
+        search_fn: Optional[Callable] = None,
+        cross_search_fn: Optional[Callable] = None,
+    ):
+        """Create a checker.
+
+        Args:
+            db: Database session exposing ``execute`` and ``commit``.
+            embed_fn: Async text → vector function; defaults to get_embedding.
+            search_fn: Async pattern-filtered vector search; defaults to
+                qdrant_search.
+            cross_search_fn: Async unfiltered vector search used by the
+                cross-regulation pass; defaults to
+                qdrant_search_cross_regulation.
+        """
+        self.db = db
+        self._embed = embed_fn or get_embedding
+        self._search = search_fn or qdrant_search
+        self._cross_search = cross_search_fn or qdrant_search_cross_regulation
+        self._cache: dict[str, list[dict]] = {}  # pattern_id → existing controls
+
+    def _load_existing(self, pattern_id: str) -> list[dict]:
+        """Load existing atomic controls with same pattern_id from DB.
+
+        Results are cached per pattern_id for the lifetime of the checker;
+        index_control drops the entry for a pattern it adds a control to.
+        """
+        if pattern_id in self._cache:
+            return self._cache[pattern_id]
+        from sqlalchemy import text
+        rows = self.db.execute(text("""
+            SELECT id::text, control_id, title, objective,
+                   pattern_id,
+                   generation_metadata->>'obligation_type' as obligation_type
+            FROM canonical_controls
+            WHERE parent_control_uuid IS NOT NULL
+              AND release_state != 'deprecated'
+              AND pattern_id = :pid
+        """), {"pid": pattern_id}).fetchall()
+        result = [
+            {
+                "uuid": r[0], "control_id": r[1], "title": r[2],
+                "objective": r[3], "pattern_id": r[4],
+                "obligation_type": r[5],
+            }
+            for r in rows
+        ]
+        self._cache[pattern_id] = result
+        return result
+
+    async def check_duplicate(
+        self,
+        action: str,
+        obj: str,
+        title: str,
+        pattern_id: Optional[str],
+    ) -> DedupResult:
+        """Run the 4-stage dedup pipeline + cross-regulation linking.
+
+        Only the best-scoring intra-pattern hit is evaluated. Every "new"
+        outcome that has an embedding is additionally passed through the
+        cross-regulation check, which may turn it into a "link".
+
+        Args:
+            action: Raw action verb of the candidate control.
+            obj: Raw object phrase of the candidate control.
+            title: Candidate title (contributes keywords to the embedding text).
+            pattern_id: Pattern of the candidate; falsy disables dedup.
+
+        Returns:
+            DedupResult with verdict: new/link/review.
+        """
+        # No pattern_id → can't dedup meaningfully
+        if not pattern_id:
+            return DedupResult(verdict="new", stage="no_pattern")
+
+        # Stage 1: Pattern-Gate
+        existing = self._load_existing(pattern_id)
+        if not existing:
+            return DedupResult(
+                verdict="new", stage="pattern_gate",
+                details={"reason": "no existing controls with this pattern_id"},
+            )
+
+        # Stage 2: Action-Check
+        # The DB rows carry no action, so the comparison is done further
+        # down against "action_normalized" in the payload of the best hit.
+        norm_action = normalize_action(action)
+
+        # Stage 3: Object-Normalization (compared against the payload as well)
+        norm_object = normalize_object(obj)
+
+        # Stage 4: Embedding Similarity
+        canonical = canonicalize_text(action, obj, title)
+        embedding = await self._embed(canonical)
+        if not embedding:
+            # Can't compute embedding → default to new
+            return DedupResult(
+                verdict="new", stage="embedding_unavailable",
+                details={"canonical_text": canonical},
+            )
+
+        # Search Qdrant
+        results = await self._search(embedding, pattern_id, top_k=5)
+
+        if not results:
+            # No intra-pattern matches → try cross-regulation
+            return await self._check_cross_regulation(embedding, DedupResult(
+                verdict="new", stage="no_qdrant_matches",
+                details={"canonical_text": canonical, "action": norm_action, "object": norm_object},
+            ), pattern_id)
+
+        # Evaluate best match
+        best = results[0]
+        best_score = best.get("score", 0.0)
+        best_payload = best.get("payload", {})
+        best_action = best_payload.get("action_normalized", "")
+        best_object = best_payload.get("object_normalized", "")
+
+        # Action differs → NEW (even if embedding is high)
+        if best_action and norm_action and best_action != norm_action:
+            return await self._check_cross_regulation(embedding, DedupResult(
+                verdict="new", stage="action_mismatch",
+                similarity_score=best_score,
+                matched_control_id=best_payload.get("control_id"),
+                details={
+                    "candidate_action": norm_action,
+                    "existing_action": best_action,
+                    "similarity": best_score,
+                },
+            ), pattern_id)
+
+        # Object differs → use higher threshold
+        if best_object and norm_object and best_object != norm_object:
+            if best_score > LINK_THRESHOLD_DIFF_OBJECT:
+                return DedupResult(
+                    verdict="link", stage="embedding_diff_object",
+                    matched_control_uuid=best_payload.get("control_uuid"),
+                    matched_control_id=best_payload.get("control_id"),
+                    matched_title=best_payload.get("title"),
+                    similarity_score=best_score,
+                    details={"candidate_object": norm_object, "existing_object": best_object},
+                )
+            return await self._check_cross_regulation(embedding, DedupResult(
+                verdict="new", stage="object_mismatch_below_threshold",
+                similarity_score=best_score,
+                matched_control_id=best_payload.get("control_id"),
+                details={
+                    "candidate_object": norm_object,
+                    "existing_object": best_object,
+                    "threshold": LINK_THRESHOLD_DIFF_OBJECT,
+                },
+            ), pattern_id)
+
+        # Same action + same object → tiered thresholds
+        if best_score > LINK_THRESHOLD:
+            return DedupResult(
+                verdict="link", stage="embedding_match",
+                matched_control_uuid=best_payload.get("control_uuid"),
+                matched_control_id=best_payload.get("control_id"),
+                matched_title=best_payload.get("title"),
+                similarity_score=best_score,
+            )
+        if best_score > REVIEW_THRESHOLD:
+            return DedupResult(
+                verdict="review", stage="embedding_review",
+                matched_control_uuid=best_payload.get("control_uuid"),
+                matched_control_id=best_payload.get("control_id"),
+                matched_title=best_payload.get("title"),
+                similarity_score=best_score,
+            )
+        return await self._check_cross_regulation(embedding, DedupResult(
+            verdict="new", stage="embedding_below_threshold",
+            similarity_score=best_score,
+            details={"threshold": REVIEW_THRESHOLD},
+        ), pattern_id)
+
+    async def _check_cross_regulation(
+        self,
+        embedding: list[float],
+        intra_result: DedupResult,
+        pattern_id: Optional[str] = None,
+    ) -> DedupResult:
+        """Second pass: cross-regulation linking for controls deemed 'new'.
+
+        Searches Qdrant WITHOUT pattern_id filter. Uses a higher threshold
+        (CROSS_REG_LINK_THRESHOLD) to avoid false positives across
+        regulation boundaries.
+
+        Args:
+            embedding: Embedding of the candidate's canonical text.
+            intra_result: Outcome of the intra-pattern stages; returned
+                unchanged unless a cross-regulation link is found.
+            pattern_id: Pattern of the candidate. Hits carrying the same
+                pattern_id are skipped: they were already judged by the
+                intra-pattern stages, and linking them here would override
+                the action/object gates that just rejected them.
+
+        Returns:
+            A "link" result with link_type "cross_regulation", or
+            ``intra_result``.
+        """
+        if intra_result.verdict != "new" or not embedding:
+            return intra_result
+
+        cross_results = await self._cross_search(embedding, top_k=5)
+        # The filter runs client-side on the returned hits only.
+        candidates = [
+            hit for hit in cross_results or []
+            if not pattern_id
+            or (hit.get("payload") or {}).get("pattern_id") != pattern_id
+        ]
+        if not candidates:
+            return intra_result
+
+        best = candidates[0]
+        best_score = best.get("score", 0.0)
+        if best_score > CROSS_REG_LINK_THRESHOLD:
+            best_payload = best.get("payload", {})
+            return DedupResult(
+                verdict="link",
+                stage="cross_regulation",
+                matched_control_uuid=best_payload.get("control_uuid"),
+                matched_control_id=best_payload.get("control_id"),
+                matched_title=best_payload.get("title"),
+                similarity_score=best_score,
+                link_type="cross_regulation",
+                details={
+                    "cross_reg_score": best_score,
+                    "cross_reg_threshold": CROSS_REG_LINK_THRESHOLD,
+                },
+            )
+
+        return intra_result
+
+    def add_parent_link(
+        self,
+        control_uuid: str,
+        parent_control_uuid: str,
+        link_type: str = "dedup_merge",
+        confidence: float = 0.0,
+        source_regulation: Optional[str] = None,
+        source_article: Optional[str] = None,
+        obligation_candidate_id: Optional[str] = None,
+    ) -> None:
+        """Add a parent link to an existing atomic control.
+
+        Inserts into control_parent_links and commits. An existing
+        (control_uuid, parent_control_uuid) pair is left untouched.
+        """
+        from sqlalchemy import text
+        # NOTE(review): a bind parameter directly followed by a "::type" cast
+        # depends on how SQLAlchemy text() parses ":name::type" — confirm for
+        # the version in use, or switch to CAST(:name AS type).
+        self.db.execute(text("""
+            INSERT INTO control_parent_links
+                (control_uuid, parent_control_uuid, link_type, confidence,
+                 source_regulation, source_article, obligation_candidate_id)
+            VALUES (:cu, :pu, :lt, :conf, :sr, :sa, :oci::uuid)
+            ON CONFLICT (control_uuid, parent_control_uuid) DO NOTHING
+        """), {
+            "cu": control_uuid,
+            "pu": parent_control_uuid,
+            "lt": link_type,
+            "conf": confidence,
+            "sr": source_regulation,
+            "sa": source_article,
+            "oci": obligation_candidate_id,
+        })
+        self.db.commit()
+
+    def write_review(
+        self,
+        candidate_control_id: str,
+        candidate_title: str,
+        candidate_objective: str,
+        result: DedupResult,
+        parent_control_uuid: Optional[str] = None,
+        obligation_candidate_id: Optional[str] = None,
+    ) -> None:
+        """Write a dedup review queue entry.
+
+        Inserts the candidate plus the match information from ``result``
+        into control_dedup_reviews and commits. ``result.details`` is
+        stored as JSON.
+        """
+        import json
+
+        from sqlalchemy import text
+        # NOTE(review): same ":name::type" caveat as in add_parent_link.
+        self.db.execute(text("""
+            INSERT INTO control_dedup_reviews
+                (candidate_control_id, candidate_title, candidate_objective,
+                 matched_control_uuid, matched_control_id,
+                 similarity_score, dedup_stage, dedup_details,
+                 parent_control_uuid, obligation_candidate_id)
+            VALUES (:ccid, :ct, :co, :mcu::uuid, :mci, :ss, :ds,
+                    :dd::jsonb, :pcu::uuid, :oci)
+        """), {
+            "ccid": candidate_control_id,
+            "ct": candidate_title,
+            "co": candidate_objective,
+            "mcu": result.matched_control_uuid,
+            "mci": result.matched_control_id,
+            "ss": result.similarity_score,
+            "ds": result.stage,
+            "dd": json.dumps(result.details),
+            "pcu": parent_control_uuid,
+            "oci": obligation_candidate_id,
+        })
+        self.db.commit()
+
+    async def index_control(
+        self,
+        control_uuid: str,
+        control_id: str,
+        title: str,
+        action: str,
+        obj: str,
+        pattern_id: str,
+    ) -> bool:
+        """Index a new atomic control in Qdrant for future dedup checks.
+
+        Returns:
+            True when the point was written, False when no embedding could
+            be computed or the upsert failed.
+        """
+        norm_action = normalize_action(action)
+        norm_object = normalize_object(obj)
+        canonical = canonicalize_text(action, obj, title)
+        embedding = await self._embed(canonical)
+        if not embedding:
+            return False
+        indexed = await qdrant_upsert(
+            point_id=control_uuid,
+            embedding=embedding,
+            payload={
+                "control_uuid": control_uuid,
+                "control_id": control_id,
+                "title": title,
+                "pattern_id": pattern_id,
+                "action_normalized": norm_action,
+                "object_normalized": norm_object,
+                "canonical_text": canonical,
+            },
+        )
+        if indexed:
+            # The cached DB snapshot predates this control. Without dropping
+            # it, a pattern cached as empty would keep short-circuiting at
+            # the pattern gate for every later candidate of the same run.
+            self._cache.pop(pattern_id, None)
+        return indexed
@@ -75,12 +75,12 @@ REGULATION_LICENSE_MAP: dict[str, dict] = {
    # RULE 1: FREE USE — Laws, Public Domain
    # source_type: "law" = binding legislation, "guideline" = authority guidance (soft law),
    #              "standard" = voluntary framework/best practice, "restricted" = protected norm
-    # EU Regulations
-    "eu_2016_679":           {"license": "EU_LAW",              "rule": 1, "source_type": "law", "name": "DSGVO"},
-    "eu_2024_1689":          {"license": "EU_LAW",              "rule": 1, "source_type": "law", "name": "AI Act (KI-Verordnung)"},
-    "eu_2022_2555":          {"license": "EU_LAW",              "rule": 1, "source_type": "law", "name": "NIS2"},
+    # EU Regulations — names MUST match canonical DB source names
+    "eu_2016_679":           {"license": "EU_LAW",              "rule": 1, "source_type": "law", "name": "DSGVO (EU) 2016/679"},
+    "eu_2024_1689":          {"license": "EU_LAW",              "rule": 1, "source_type": "law", "name": "KI-Verordnung (EU) 2024/1689"},
+    "eu_2022_2555":          {"license": "EU_LAW",              "rule": 1, "source_type": "law", "name": "NIS2-Richtlinie (EU) 2022/2555"},
    "eu_2024_2847":          {"license": "EU_LAW",              "rule": 1, "source_type": "law", "name": "Cyber Resilience Act (CRA)"},
-    "eu_2023_1230":          {"license": "EU_LAW",              "rule": 1, "source_type": "law", "name": "Maschinenverordnung"},
+    "eu_2023_1230":          {"license": "EU_LAW",              "rule": 1, "source_type": "law", "name": "Maschinenverordnung (EU) 2023/1230"},
    "eu_2022_2065":          {"license": "EU_LAW",              "rule": 1, "source_type": "law", "name": "Digital Services Act (DSA)"},
    "eu_2022_1925":          {"license": "EU_LAW",              "rule": 1, "source_type": "law", "name": "Digital Markets Act (DMA)"},
    "eu_2022_868":           {"license": "EU_LAW",              "rule": 1, "source_type": "law", "name": "Data Governance Act (DGA)"},
@@ -88,52 +88,52 @@ REGULATION_LICENSE_MAP: dict[str, dict] = {
    "eu_2021_914":           {"license": "EU_LAW",              "rule": 1, "source_type": "law", "name": "Standardvertragsklauseln (SCC)"},
    "eu_2002_58":            {"license": "EU_LAW",              "rule": 1, "source_type": "law", "name": "ePrivacy-Richtlinie"},
    "eu_2000_31":            {"license": "EU_LAW",              "rule": 1, "source_type": "law", "name": "E-Commerce-Richtlinie"},
-    "eu_2023_1803":          {"license": "EU_LAW",              "rule": 1, "source_type": "law", "name": "IFRS-Uebernahmeverordnung"},
+    "eu_2023_1803":          {"license": "EU_LAW",              "rule": 1, "source_type": "law", "name": "IFRS-Übernahmeverordnung"},
    "eucsa":                 {"license": "EU_LAW",              "rule": 1, "source_type": "law", "name": "EU Cybersecurity Act"},
    "dataact":               {"license": "EU_LAW",              "rule": 1, "source_type": "law", "name": "Data Act"},
    "dora":                  {"license": "EU_LAW",              "rule": 1, "source_type": "law", "name": "Digital Operational Resilience Act"},
    "ehds":                  {"license": "EU_LAW",              "rule": 1, "source_type": "law", "name": "European Health Data Space"},
    "gpsr":                  {"license": "EU_LAW",              "rule": 1, "source_type": "law", "name": "Allgemeine Produktsicherheitsverordnung"},
    "eu_2023_988":           {"license": "EU_LAW",              "rule": 1, "source_type": "law", "name": "Allgemeine Produktsicherheitsverordnung (GPSR)"},
-    "eu_2023_1542":          {"license": "EU_LAW",              "rule": 1, "source_type": "law", "name": "Batterieverordnung"},
-    "mica":                  {"license": "EU_LAW",              "rule": 1, "source_type": "law", "name": "Markets in Crypto-Assets"},
+    "eu_2023_1542":          {"license": "EU_LAW",              "rule": 1, "source_type": "law", "name": "Batterieverordnung (EU) 2023/1542"},
+    "mica":                  {"license": "EU_LAW",              "rule": 1, "source_type": "law", "name": "Markets in Crypto-Assets (MiCA)"},
    "psd2":                  {"license": "EU_LAW",              "rule": 1, "source_type": "law", "name": "Zahlungsdiensterichtlinie 2"},
    "dpf":                   {"license": "EU_LAW",              "rule": 1, "source_type": "law", "name": "EU-US Data Privacy Framework"},
    "dsm":                   {"license": "EU_LAW",              "rule": 1, "source_type": "law", "name": "DSM-Urheberrechtsrichtlinie"},
    "amlr":                  {"license": "EU_LAW",              "rule": 1, "source_type": "law", "name": "AML-Verordnung"},
-    "eu_blue_guide_2022":    {"license": "EU_PUBLIC",           "rule": 1, "source_type": "guideline", "name": "Blue Guide 2022"},
+    "eu_blue_guide_2022":    {"license": "EU_PUBLIC",           "rule": 1, "source_type": "guideline", "name": "EU Blue Guide 2022"},
    # NIST (Public Domain — NOT laws, voluntary standards)
-    "nist_sp_800_53":        {"license": "NIST_PUBLIC_DOMAIN",  "rule": 1, "source_type": "standard", "name": "NIST SP 800-53"},
-    "nist_sp800_53r5":       {"license": "NIST_PUBLIC_DOMAIN",  "rule": 1, "source_type": "standard", "name": "NIST SP 800-53 Rev.5"},
-    "nist_sp_800_63b":       {"license": "NIST_PUBLIC_DOMAIN",  "rule": 1, "source_type": "standard", "name": "NIST SP 800-63B"},
+    "nist_sp_800_53":        {"license": "NIST_PUBLIC_DOMAIN",  "rule": 1, "source_type": "standard", "name": "NIST SP 800-53 Rev. 5"},
+    "nist_sp800_53r5":       {"license": "NIST_PUBLIC_DOMAIN",  "rule": 1, "source_type": "standard", "name": "NIST SP 800-53 Rev. 5"},
+    "nist_sp_800_63b":       {"license": "NIST_PUBLIC_DOMAIN",  "rule": 1, "source_type": "standard", "name": "NIST SP 800-63-3"},
    "nist_sp800_63_3":       {"license": "NIST_PUBLIC_DOMAIN",  "rule": 1, "source_type": "standard", "name": "NIST SP 800-63-3"},
-    "nist_csf_2_0":          {"license": "NIST_PUBLIC_DOMAIN",  "rule": 1, "source_type": "standard", "name": "NIST CSF 2.0"},
-    "nist_sp_800_218":       {"license": "NIST_PUBLIC_DOMAIN",  "rule": 1, "source_type": "standard", "name": "NIST SSDF"},
-    "nist_sp800_218":        {"license": "NIST_PUBLIC_DOMAIN",  "rule": 1, "source_type": "standard", "name": "NIST SSDF"},
-    "nist_sp800_207":        {"license": "NIST_PUBLIC_DOMAIN",  "rule": 1, "source_type": "standard", "name": "NIST SP 800-207 Zero Trust"},
+    "nist_csf_2_0":          {"license": "NIST_PUBLIC_DOMAIN",  "rule": 1, "source_type": "standard", "name": "NIST Cybersecurity Framework 2.0"},
+    "nist_sp_800_218":       {"license": "NIST_PUBLIC_DOMAIN",  "rule": 1, "source_type": "standard", "name": "NIST SP 800-218 (SSDF)"},
+    "nist_sp800_218":        {"license": "NIST_PUBLIC_DOMAIN",  "rule": 1, "source_type": "standard", "name": "NIST SP 800-218 (SSDF)"},
+    "nist_sp800_207":        {"license": "NIST_PUBLIC_DOMAIN",  "rule": 1, "source_type": "standard", "name": "NIST SP 800-207 (Zero Trust)"},
    "nist_ai_rmf":           {"license": "NIST_PUBLIC_DOMAIN",  "rule": 1, "source_type": "standard", "name": "NIST AI Risk Management Framework"},
    "nist_privacy_1_0":      {"license": "NIST_PUBLIC_DOMAIN",  "rule": 1, "source_type": "standard", "name": "NIST Privacy Framework 1.0"},
    "nistir_8259a":          {"license": "NIST_PUBLIC_DOMAIN",  "rule": 1, "source_type": "standard", "name": "NISTIR 8259A IoT Security"},
    "cisa_secure_by_design": {"license": "US_GOV_PUBLIC",       "rule": 1, "source_type": "standard", "name": "CISA Secure by Design"},
    # German Laws
-    "bdsg":                  {"license": "DE_LAW",              "rule": 1, "source_type": "law", "name": "BDSG"},
-    "bdsg_2018_komplett":    {"license": "DE_LAW",              "rule": 1, "source_type": "law", "name": "BDSG 2018"},
+    "bdsg":                  {"license": "DE_LAW",              "rule": 1, "source_type": "law", "name": "Bundesdatenschutzgesetz (BDSG)"},
+    "bdsg_2018_komplett":    {"license": "DE_LAW",              "rule": 1, "source_type": "law", "name": "Bundesdatenschutzgesetz (BDSG)"},
    "ttdsg":                 {"license": "DE_LAW",              "rule": 1, "source_type": "law", "name": "TTDSG"},
    "tdddg_25":              {"license": "DE_LAW",              "rule": 1, "source_type": "law", "name": "TDDDG"},
    "tkg":                   {"license": "DE_LAW",              "rule": 1, "source_type": "law", "name": "TKG"},
    "de_tkg":                {"license": "DE_LAW",              "rule": 1, "source_type": "law", "name": "TKG"},
    "bgb_komplett":          {"license": "DE_LAW",              "rule": 1, "source_type": "law", "name": "BGB"},
-    "hgb":                   {"license": "DE_LAW",              "rule": 1, "source_type": "law", "name": "HGB"},
-    "hgb_komplett":          {"license": "DE_LAW",              "rule": 1, "source_type": "law", "name": "HGB"},
+    "hgb":                   {"license": "DE_LAW",              "rule": 1, "source_type": "law", "name": "Handelsgesetzbuch (HGB)"},
+    "hgb_komplett":          {"license": "DE_LAW",              "rule": 1, "source_type": "law", "name": "Handelsgesetzbuch (HGB)"},
    "urhg_komplett":         {"license": "DE_LAW",              "rule": 1, "source_type": "law", "name": "UrhG"},
    "uwg":                   {"license": "DE_LAW",              "rule": 1, "source_type": "law", "name": "UWG"},
    "tmg_komplett":          {"license": "DE_LAW",              "rule": 1, "source_type": "law", "name": "TMG"},
-    "gewo":                  {"license": "DE_LAW",              "rule": 1, "source_type": "law", "name": "GewO"},
-    "ao":                    {"license": "DE_LAW",              "rule": 1, "source_type": "law", "name": "Abgabenordnung"},
-    "ao_komplett":           {"license": "DE_LAW",              "rule": 1, "source_type": "law", "name": "Abgabenordnung"},
+    "gewo":                  {"license": "DE_LAW",              "rule": 1, "source_type": "law", "name": "Gewerbeordnung (GewO)"},
+    "ao":                    {"license": "DE_LAW",              "rule": 1, "source_type": "law", "name": "Abgabenordnung (AO)"},
+    "ao_komplett":           {"license": "DE_LAW",              "rule": 1, "source_type": "law", "name": "Abgabenordnung (AO)"},
    "battdg":                {"license": "DE_LAW",              "rule": 1, "source_type": "law", "name": "Batteriegesetz"},
    # Austrian Laws
-    "at_dsg":                {"license": "AT_LAW",              "rule": 1, "source_type": "law", "name": "AT DSG"},
+    "at_dsg":                {"license": "AT_LAW",              "rule": 1, "source_type": "law", "name": "Österreichisches Datenschutzgesetz (DSG)"},
    "at_abgb":               {"license": "AT_LAW",              "rule": 1, "source_type": "law", "name": "AT ABGB"},
    "at_abgb_agb":           {"license": "AT_LAW",              "rule": 1, "source_type": "law", "name": "AT ABGB AGB-Recht"},
    "at_bao":                {"license": "AT_LAW",              "rule": 1, "source_type": "law", "name": "AT BAO"},
@@ -141,7 +141,7 @@ REGULATION_LICENSE_MAP: dict[str, dict] = {
    "at_ecg":                {"license": "AT_LAW",              "rule": 1, "source_type": "law", "name": "AT E-Commerce-Gesetz"},
    "at_kschg":              {"license": "AT_LAW",              "rule": 1, "source_type": "law", "name": "AT Konsumentenschutzgesetz"},
    "at_medieng":            {"license": "AT_LAW",              "rule": 1, "source_type": "law", "name": "AT Mediengesetz"},
-    "at_tkg":                {"license": "AT_LAW",              "rule": 1, "source_type": "law", "name": "AT TKG"},
+    "at_tkg":                {"license": "AT_LAW",              "rule": 1, "source_type": "law", "name": "Telekommunikationsgesetz Oesterreich"},
    "at_ugb":                {"license": "AT_LAW",              "rule": 1, "source_type": "law", "name": "AT UGB"},
    "at_ugb_ret":            {"license": "AT_LAW",              "rule": 1, "source_type": "law", "name": "AT UGB Retention"},
    "at_uwg":                {"license": "AT_LAW",              "rule": 1, "source_type": "law", "name": "AT UWG"},
@@ -179,21 +179,21 @@ REGULATION_LICENSE_MAP: dict[str, dict] = {
    "wp260_transparency":    {"license": "EU_PUBLIC",           "rule": 1, "source_type": "guideline", "name": "WP29 Transparency"},

    # RULE 2: CITATION REQUIRED — CC-BY, CC-BY-SA (voluntary standards)
-    "owasp_asvs":            {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard", "name": "OWASP ASVS",
+    "owasp_asvs":            {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard", "name": "OWASP ASVS 4.0",
                              "attribution": "OWASP Foundation, CC BY-SA 4.0"},
-    "owasp_masvs":           {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard", "name": "OWASP MASVS",
+    "owasp_masvs":           {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard", "name": "OWASP MASVS 2.0",
                              "attribution": "OWASP Foundation, CC BY-SA 4.0"},
-    "owasp_top10":           {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard", "name": "OWASP Top 10",
+    "owasp_top10":           {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard", "name": "OWASP Top 10 (2021)",
                              "attribution": "OWASP Foundation, CC BY-SA 4.0"},
-    "owasp_top10_2021":      {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard", "name": "OWASP Top 10 2021",
+    "owasp_top10_2021":      {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard", "name": "OWASP Top 10 (2021)",
                              "attribution": "OWASP Foundation, CC BY-SA 4.0"},
-    "owasp_api_top10_2023":  {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard", "name": "OWASP API Top 10 2023",
+    "owasp_api_top10_2023":  {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard", "name": "OWASP API Security Top 10 (2023)",
                              "attribution": "OWASP Foundation, CC BY-SA 4.0"},
-    "owasp_samm":            {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard", "name": "OWASP SAMM",
+    "owasp_samm":            {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard", "name": "OWASP SAMM 2.0",
                              "attribution": "OWASP Foundation, CC BY-SA 4.0"},
    "owasp_mobile_top10":    {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard", "name": "OWASP Mobile Top 10",
                              "attribution": "OWASP Foundation, CC BY-SA 4.0"},
-    "oecd_ai_principles":    {"license": "OECD_PUBLIC",  "rule": 2, "source_type": "standard", "name": "OECD AI Principles",
+    "oecd_ai_principles":    {"license": "OECD_PUBLIC",  "rule": 2, "source_type": "standard", "name": "OECD KI-Empfehlung",
                              "attribution": "OECD"},

    # RULE 3: RESTRICTED — Full reformulation required
@@ -626,6 +626,7 @@ async def _llm_ollama(prompt: str, system_prompt: Optional[str] = None) -> str:
        "model": OLLAMA_MODEL,
        "messages": messages,
        "stream": False,
+        "format": "json",
        "options": {"num_predict": 512},  # Limit response length for speed
        "think": False,  # Disable thinking for faster responses
    }
@@ -1040,8 +1041,10 @@ Quelle: {chunk.regulation_name} ({chunk.regulation_code}), {chunk.article}"""
        effective_paragraph = llm_paragraph or chunk.paragraph or ""
        control.license_rule = 1
        control.source_original_text = chunk.text
+        # Use canonical name from REGULATION_LICENSE_MAP, not Qdrant's regulation_name
+        canonical_source = license_info.get("name", chunk.regulation_name)
        control.source_citation = {
-            "source": chunk.regulation_name,
+            "source": canonical_source,
            "article": effective_article,
            "paragraph": effective_paragraph,
            "license": license_info.get("license", ""),
@@ -1105,8 +1108,10 @@ Quelle: {chunk.regulation_name}, {chunk.article}"""
        effective_paragraph = llm_paragraph or chunk.paragraph or ""
        control.license_rule = 2
        control.source_original_text = chunk.text
+        # Use canonical name from REGULATION_LICENSE_MAP, not Qdrant's regulation_name
+        canonical_source = license_info.get("name", chunk.regulation_name)
        control.source_citation = {
-            "source": chunk.regulation_name,
+            "source": canonical_source,
            "article": effective_article,
            "paragraph": effective_paragraph,
            "license": license_info.get("license", ""),
@@ -1277,8 +1282,10 @@ Gib ein JSON-Array zurueck mit GENAU {len(chunks)} Elementen. Fuer Chunks ohne A
            effective_paragraph = llm_paragraph or chunk.paragraph or ""
            if lic["rule"] in (1, 2):
                control.source_original_text = chunk.text
+                # Use canonical name from REGULATION_LICENSE_MAP, not Qdrant's regulation_name
+                canonical_source = lic.get("name", chunk.regulation_name)
                control.source_citation = {
-                    "source": chunk.regulation_name,
+                    "source": canonical_source,
                    "article": effective_article,
                    "paragraph": effective_paragraph,
                    "license": lic.get("license", ""),
@@ -46,20 +46,62 @@ ANTHROPIC_API_URL = "https://api.anthropic.com/v1"


 # ---------------------------------------------------------------------------
-# Normative signal detection (Rule 1)
+# Normative signal detection — 3-Tier Classification
 # ---------------------------------------------------------------------------
+# Tier 1: Pflicht (mandatory) — strong normative signals
+# Tier 2: Empfehlung (recommendation) — weaker normative signals
+# Tier 3: Kann (optional/permissive) — permissive signals
+# Nothing is rejected — everything is classified.

-_NORMATIVE_SIGNALS = [
+_PFLICHT_SIGNALS = [
+    # German modal phrasings that express an obligation
    r"\bmüssen\b", r"\bmuss\b", r"\bhat\s+sicherzustellen\b",
    r"\bhaben\s+sicherzustellen\b", r"\bsind\s+verpflichtet\b",
-    r"\bist\s+verpflichtet\b", r"\bist\s+zu\s+\w+en\b",
-    r"\bsind\s+zu\s+\w+en\b", r"\bhat\s+zu\s+\w+en\b",
-    r"\bhaben\s+zu\s+\w+en\b", r"\bsoll\b", r"\bsollen\b",
-    r"\bgewährleisten\b", r"\bsicherstellen\b",
+    r"\bist\s+verpflichtet\b",
+    # "ist zu prüfen", "sind zu dokumentieren" ("zu" directly before the verb)
+    r"\bist\s+zu\s+\w+en\b", r"\bsind\s+zu\s+\w+en\b",
+    r"\bhat\s+zu\s+\w+en\b", r"\bhaben\s+zu\s+\w+en\b",
+    # "ist festzustellen", "sind vorzunehmen" (compound verbs with embedded "zu")
+    r"\bist\s+\w+zu\w+en\b", r"\bsind\s+\w+zu\w+en\b",
+    # "ist zusätzlich zu prüfen", "sind regelmäßig zu überwachen" (one word in between)
+    r"\bist\s+\w+\s+zu\s+\w+en\b", r"\bsind\s+\w+\s+zu\s+\w+en\b",
+    r"\bhat\s+\w+\s+zu\s+\w+en\b", r"\bhaben\s+\w+\s+zu\s+\w+en\b",
+    # English obligation signals
    r"\bshall\b", r"\bmust\b", r"\brequired\b",
-    r"\bshould\b", r"\bensure\b",
+    # Compound infinitives with embedded "zu": mitzuteilen, anzuwenden, bereitzustellen
+    r"\b\w+zuteilen\b", r"\b\w+zuwenden\b", r"\b\w+zustellen\b", r"\b\w+zulegen\b",
+    r"\b\w+zunehmen\b", r"\b\w+zuführen\b", r"\b\w+zuhalten\b", r"\b\w+zusetzen\b",
+    r"\b\w+zuweisen\b", r"\b\w+zuordnen\b", r"\b\w+zufügen\b", r"\b\w+zugeben\b",
+    # Broad pattern: "ist ... [up to 80 characters] ... zu + infinitive"
+    r"\bist\b.{1,80}\bzu\s+\w+en\b", r"\bsind\b.{1,80}\bzu\s+\w+en\b",
 ]
-_NORMATIVE_RE = re.compile("|".join(_NORMATIVE_SIGNALS), re.IGNORECASE)
+_PFLICHT_RE = re.compile("|".join(_PFLICHT_SIGNALS), re.IGNORECASE)
+
+_EMPFEHLUNG_SIGNALS = [
+    # Modal verbs (weaker than "muss")
+    r"\bsoll\b", r"\bsollen\b", r"\bsollte\b", r"\bsollten\b",
+    r"\bgewährleisten\b", r"\bsicherstellen\b",
+    # English recommendation signals
+    r"\bshould\b", r"\bensure\b", r"\brecommend\w*\b",
+    # Common normative infinitives (no auxiliary verb, treated as recommendation)
+    r"\bnachweisen\b", r"\beinhalten\b", r"\bunterlassen\b", r"\bwahren\b",
+    r"\bdokumentieren\b", r"\bimplementieren\b", r"\büberprüfen\b", r"\büberwachen\b",
+    # Check instructions ("prüfen, ob ...") treated as a normative statement
+    r"\bprüfen,\s+ob\b", r"\bkontrollieren,\s+ob\b",
+]
+_EMPFEHLUNG_RE = re.compile("|".join(_EMPFEHLUNG_SIGNALS), re.IGNORECASE)
+
+_KANN_SIGNALS = [
+    # NOTE(review): r"\bdarf\b" / r"\bdürfen\b" also match negated forms such as
+    # "darf nicht" (a prohibition) — confirm that letting those fall into the
+    # 'kann' tier when no stronger signal matches is intended.
+    r"\bkann\b", r"\bkönnen\b", r"\bdarf\b", r"\bdürfen\b",
+    r"\bmay\b", r"\boptional\b",
+]
+_KANN_RE = re.compile("|".join(_KANN_SIGNALS), re.IGNORECASE)
+
+# Union of all normative signals (for backward-compatible has_normative_signal flag)
+_NORMATIVE_RE = re.compile(
+    "|".join(_PFLICHT_SIGNALS + _EMPFEHLUNG_SIGNALS + _KANN_SIGNALS),
+    re.IGNORECASE,
+)

 _RATIONALE_SIGNALS = [
    r"\bda\s+", r"\bweil\b", r"\bgrund\b", r"\berwägung",
@@ -100,6 +142,7 @@ class ObligationCandidate:
    object_: str = ""
    condition: Optional[str] = None
    normative_strength: str = "must"
+    obligation_type: str = "pflicht"  # pflicht | empfehlung | kann
    is_test_obligation: bool = False
    is_reporting_obligation: bool = False
    extraction_confidence: float = 0.0
@@ -115,6 +158,7 @@ class ObligationCandidate:
            "object": self.object_,
            "condition": self.condition,
            "normative_strength": self.normative_strength,
+            "obligation_type": self.obligation_type,
            "is_test_obligation": self.is_test_obligation,
            "is_reporting_obligation": self.is_reporting_obligation,
            "extraction_confidence": self.extraction_confidence,
@@ -162,11 +206,30 @@ class AtomicControlCandidate:
 # ---------------------------------------------------------------------------


+def classify_obligation_type(txt: str) -> str:
+    """Map an obligation text to one of the three obligation tiers.
+
+    The tier patterns are tried from strongest to weakest and the first
+    one that matches decides the result: 'pflicht', then 'empfehlung',
+    then 'kann'.
+
+    Args:
+        txt: Obligation text to inspect.
+
+    Returns:
+        'pflicht', 'empfehlung' or 'kann'. Text that matches none of the
+        tier patterns is never rejected; it falls back to 'empfehlung'.
+    """
+    tiers = (
+        (_PFLICHT_RE, "pflicht"),
+        (_EMPFEHLUNG_RE, "empfehlung"),
+        (_KANN_RE, "kann"),
+    )
+    for tier_re, tier_name in tiers:
+        if tier_re.search(txt):
+            return tier_name
+    # No signal matched: the LLM still proposed this as an obligation, so it
+    # is kept and treated as a recommendation rather than dropped.
+    return "empfehlung"
+
+
 def quality_gate(candidate: ObligationCandidate) -> dict:
    """Validate an obligation candidate. Returns quality flags dict.

    Checks:
-        has_normative_signal: text contains normative language
+        has_normative_signal: text contains normative language (informational)
+        obligation_type: pflicht | empfehlung | kann (classified, never rejected)
        single_action: only one main action (heuristic)
        not_rationale: not just a justification/reasoning
        not_evidence_only: not just an evidence requirement
@@ -176,9 +239,12 @@ def quality_gate(candidate: ObligationCandidate) -> dict:
    txt = candidate.obligation_text
    flags = {}

-    # 1. Normative signal
+    # 1. Normative signal (informational — no longer used for rejection)
    flags["has_normative_signal"] = bool(_NORMATIVE_RE.search(txt))

+    # 1b. Obligation type classification
+    flags["obligation_type"] = classify_obligation_type(txt)
+
    # 2. Single action heuristic — count "und" / "and" / "sowie" splits
    #    that connect different verbs (imperfect but useful)
    multi_verb_re = re.compile(
@@ -210,8 +276,12 @@ def quality_gate(candidate: ObligationCandidate) -> dict:


 def passes_quality_gate(flags: dict) -> bool:
-    """Check if all critical quality flags pass."""
-    critical = ["has_normative_signal", "not_evidence_only", "min_length", "has_parent_link"]
+    """Check if critical quality flags pass.
+
+    Note: has_normative_signal is NO LONGER critical — obligations without
+    normative signal are classified as 'empfehlung' instead of being rejected.
+
+    Args:
+        flags: Quality flags dict, as returned by ``quality_gate``.
+
+    Returns:
+        True only when every critical flag is present in ``flags`` and
+        truthy; a missing flag counts as a failure.
+    """
+    critical = ["not_evidence_only", "min_length", "has_parent_link"]
    return all(flags.get(k, False) for k in critical)


@@ -224,6 +294,13 @@ _PASS0A_SYSTEM_PROMPT = """\
 Du bist ein Rechts-Compliance-Experte. Du zerlegst Compliance-Controls \
 in einzelne atomare Pflichten.

+ANALYSE-SCHRITTE (intern durchfuehren, NICHT im Output!):
+1. Identifiziere den Adressaten (Wer muss handeln?)
+2. Identifiziere die Handlung (Was muss getan werden?)
+3. Bestimme die normative Staerke (muss/soll/kann)
+4. Pruefe ob Test- oder Meldepflicht vorliegt (separat erfassen!)
+5. Formuliere jede Pflicht als eigenstaendiges JSON-Objekt
+
 REGELN (STRIKT EINHALTEN):
 1. Nur normative Aussagen extrahieren — erkennbar an: müssen, haben \
 sicherzustellen, sind verpflichtet, ist zu dokumentieren, ist zu melden, \
@@ -272,6 +349,12 @@ _PASS0B_SYSTEM_PROMPT = """\
 Du bist ein Security-Compliance-Experte. Du erstellst aus einer einzelnen \
 normativen Pflicht ein praxisorientiertes, atomares Security Control.

+ANALYSE-SCHRITTE (intern durchfuehren, NICHT im Output!):
+1. Identifiziere die konkrete Anforderung aus der Pflicht
+2. Leite eine umsetzbare technische/organisatorische Massnahme ab
+3. Definiere ein Pruefverfahren (wie wird Umsetzung verifiziert?)
+4. Bestimme den Nachweis (welches Dokument/Artefakt belegt Compliance?)
+
 Das Control muss UMSETZBAR sein — keine Gesetzesparaphrase.
 Antworte NUR als JSON. Keine Erklärungen."""

@@ -603,8 +686,15 @@ class DecompositionPass:
        stats_0b = await decomp.run_pass0b(limit=100)
    """

-    def __init__(self, db: Session):
+    def __init__(self, db: Session, dedup_enabled: bool = False):
+        """Bind the pass to a database session and optionally enable dedup.
+
+        Args:
+            db: Database session, stored as ``self.db``.
+            dedup_enabled: When True, and the ``DEDUP_ENABLED`` flag exported
+                by ``compliance.services.control_dedup`` is also truthy, a
+                ``ControlDedupChecker`` is created for ``db`` and stored as
+                ``self._dedup``. In every other case ``self._dedup`` stays
+                ``None``.
+        """
        self.db = db
+        self._dedup = None
+        if dedup_enabled:
+            # The dedup module is imported only when dedup was requested.
+            from compliance.services.control_dedup import (
+                ControlDedupChecker, DEDUP_ENABLED,
+            )
+            if DEDUP_ENABLED:
+                self._dedup = ControlDedupChecker(db)

    # -------------------------------------------------------------------
    # Pass 0a: Obligation Extraction
@@ -810,10 +900,11 @@ class DecompositionPass:
            if not cand.is_reporting_obligation and _REPORTING_RE.search(cand.obligation_text):
                cand.is_reporting_obligation = True

-            # Quality gate
+            # Quality gate + obligation type classification
            flags = quality_gate(cand)
            cand.quality_flags = flags
            cand.extraction_confidence = _compute_extraction_confidence(flags)
+            cand.obligation_type = flags.get("obligation_type", "empfehlung")

            if passes_quality_gate(flags):
                cand.release_state = "validated"
@@ -877,6 +968,9 @@ class DecompositionPass:
            "errors": 0,
            "provider": "anthropic" if use_anthropic else "ollama",
            "batch_size": batch_size,
+            "dedup_enabled": self._dedup is not None,
+            "dedup_linked": 0,
+            "dedup_review": 0,
        }

        # Prepare obligation data
@@ -915,7 +1009,7 @@ class DecompositionPass:
                    results_by_id = _parse_json_object(llm_response)
                    for obl in batch:
                        parsed = results_by_id.get(obl["candidate_id"], {})
-                        self._process_pass0b_control(obl, parsed, stats)
+                        await self._process_pass0b_control(obl, parsed, stats)
                elif use_anthropic:
                    obl = batch[0]
                    prompt = _build_pass0b_prompt(
@@ -931,7 +1025,7 @@ class DecompositionPass:
                    )
                    stats["llm_calls"] += 1
                    parsed = _parse_json_object(llm_response)
-                    self._process_pass0b_control(obl, parsed, stats)
+                    await self._process_pass0b_control(obl, parsed, stats)
                else:
                    from compliance.services.obligation_extractor import _llm_ollama
                    obl = batch[0]
@@ -948,7 +1042,7 @@ class DecompositionPass:
                    )
                    stats["llm_calls"] += 1
                    parsed = _parse_json_object(llm_response)
-                    self._process_pass0b_control(obl, parsed, stats)
+                    await self._process_pass0b_control(obl, parsed, stats)

            except Exception as e:
                ids = ", ".join(o["candidate_id"] for o in batch)
@@ -959,10 +1053,16 @@ class DecompositionPass:
        logger.info("Pass 0b: %s", stats)
        return stats

-    def _process_pass0b_control(
+    async def _process_pass0b_control(
        self, obl: dict, parsed: dict, stats: dict,
    ) -> None:
-        """Create atomic control from parsed LLM output or template fallback."""
+        """Create atomic control from parsed LLM output or template fallback.
+
+        If dedup is enabled, checks for duplicates before insertion:
+        - LINK: adds parent link to existing control instead of creating new
+        - REVIEW: queues for human review, does not create control
+        - NEW: creates new control and indexes in Qdrant
+        """
        if not parsed or not parsed.get("title"):
            atomic = _template_fallback(
                obligation_text=obl["obligation_text"],
@@ -990,6 +1090,56 @@ class DecompositionPass:
        atomic.parent_control_uuid = obl["parent_uuid"]
        atomic.obligation_candidate_id = obl["candidate_id"]

+        # ── Dedup check (if enabled) ────────────────────────────
+        if self._dedup:
+            pattern_id = None
+            # Try to get pattern_id from parent control
+            pid_row = self.db.execute(text(
+                "SELECT pattern_id FROM canonical_controls WHERE id = CAST(:uid AS uuid)"
+            ), {"uid": obl["parent_uuid"]}).fetchone()
+            if pid_row:
+                pattern_id = pid_row[0]
+
+            result = await self._dedup.check_duplicate(
+                action=obl.get("action", ""),
+                obj=obl.get("object", ""),
+                title=atomic.title,
+                pattern_id=pattern_id,
+            )
+
+            if result.verdict == "link":
+                self._dedup.add_parent_link(
+                    control_uuid=result.matched_control_uuid,
+                    parent_control_uuid=obl["parent_uuid"],
+                    link_type="dedup_merge",
+                    confidence=result.similarity_score,
+                )
+                stats.setdefault("dedup_linked", 0)
+                stats["dedup_linked"] += 1
+                stats["candidates_processed"] += 1
+                logger.info("Dedup LINK: %s → %s (%.3f, %s)",
+                            atomic.title[:60], result.matched_control_id,
+                            result.similarity_score, result.stage)
+                return
+
+            if result.verdict == "review":
+                self._dedup.write_review(
+                    candidate_control_id=atomic.candidate_id or "",
+                    candidate_title=atomic.title,
+                    candidate_objective=atomic.objective,
+                    result=result,
+                    parent_control_uuid=obl["parent_uuid"],
+                    obligation_candidate_id=obl.get("oc_id"),
+                )
+                stats.setdefault("dedup_review", 0)
+                stats["dedup_review"] += 1
+                stats["candidates_processed"] += 1
+                logger.info("Dedup REVIEW: %s ↔ %s (%.3f, %s)",
+                            atomic.title[:60], result.matched_control_id,
+                            result.similarity_score, result.stage)
+                return
+
+        # ── Create new atomic control ───────────────────────────
        seq = self._next_atomic_seq(obl["parent_control_id"])
        atomic.candidate_id = f"{obl['parent_control_id']}-A{seq:02d}"

@@ -1006,6 +1156,29 @@ class DecompositionPass:
            {"oc_id": obl["oc_id"]},
        )

+        # Index in Qdrant for future dedup checks
+        if self._dedup:
+            pattern_id_val = None
+            pid_row2 = self.db.execute(text(
+                "SELECT pattern_id FROM canonical_controls WHERE id = CAST(:uid AS uuid)"
+            ), {"uid": obl["parent_uuid"]}).fetchone()
+            if pid_row2:
+                pattern_id_val = pid_row2[0]
+
+            # Get the UUID of the newly inserted control
+            new_row = self.db.execute(text(
+                "SELECT id::text FROM canonical_controls WHERE control_id = :cid ORDER BY created_at DESC LIMIT 1"
+            ), {"cid": atomic.candidate_id}).fetchone()
+            if new_row and pattern_id_val:
+                await self._dedup.index_control(
+                    control_uuid=new_row[0],
+                    control_id=atomic.candidate_id,
+                    title=atomic.title,
+                    action=obl.get("action", ""),
+                    obj=obl.get("object", ""),
+                    pattern_id=pattern_id_val,
+                )
+
        stats["controls_created"] += 1
        stats["candidates_processed"] += 1

@@ -1415,7 +1588,7 @@ class DecompositionPass:
                if pass_type == "0a":
                    self._handle_batch_result_0a(custom_id, text_content, stats)
                else:
-                    self._handle_batch_result_0b(custom_id, text_content, stats)
+                    await self._handle_batch_result_0b(custom_id, text_content, stats)
            except Exception as e:
                logger.error("Processing batch result %s: %s", custom_id, e)
                stats["errors"] += 1
@@ -1466,7 +1639,7 @@ class DecompositionPass:
                self._process_pass0a_obligations(raw_obls, control_id, control_uuid, stats)
                stats["controls_processed"] += 1

-    def _handle_batch_result_0b(
+    async def _handle_batch_result_0b(
        self, custom_id: str, text_content: str, stats: dict,
    ) -> None:
        """Process a single Pass 0b batch result."""
@@ -1477,14 +1650,14 @@ class DecompositionPass:
            parsed = _parse_json_object(text_content)
            obl = self._load_obligation_for_0b(candidate_ids[0])
            if obl:
-                self._process_pass0b_control(obl, parsed, stats)
+                await self._process_pass0b_control(obl, parsed, stats)
        else:
            results_by_id = _parse_json_object(text_content)
            for cand_id in candidate_ids:
                parsed = results_by_id.get(cand_id, {})
                obl = self._load_obligation_for_0b(cand_id)
                if obl:
-                    self._process_pass0b_control(obl, parsed, stats)
+                    await self._process_pass0b_control(obl, parsed, stats)

    def _load_obligation_for_0b(self, candidate_id: str) -> Optional[dict]:
        """Load obligation data needed for Pass 0b processing."""
@@ -524,6 +524,7 @@ async def _llm_ollama(prompt: str, system_prompt: Optional[str] = None) -> str:
        "model": OLLAMA_MODEL,
        "messages": messages,
        "stream": False,
+        "format": "json",
        "options": {"num_predict": 512},
        "think": False,
    }
@@ -100,6 +100,40 @@ class ComplianceRAGClient:
            logger.warning("RAG search failed: %s", e)
            return []

+    async def search_with_rerank(
+        self,
+        query: str,
+        collection: str = "bp_compliance_ce",
+        regulations: Optional[List[str]] = None,
+        top_k: int = 5,
+    ) -> List[RAGSearchResult]:
+        """
+        Search with optional cross-encoder re-ranking.
+
+        When the reranker is disabled (``get_reranker()`` returns None) this
+        is a plain ``search`` call. Otherwise an enlarged candidate pool of
+        ``max(top_k * 4, 20)`` hits is fetched, scored by the cross-encoder,
+        and the best ``top_k`` are returned in ranked order.
+
+        Args:
+            query: Search query; also used as the re-ranking query.
+            collection: Collection name passed through to ``search``.
+            regulations: Optional regulation filter passed through to ``search``.
+            top_k: Number of results to return.
+
+        Returns:
+            The re-ranked top ``top_k`` results. If re-ranking raises, the
+            first ``top_k`` candidates are returned in their original search
+            order instead.
+        """
+        import asyncio
+
+        from .reranker import get_reranker
+
+        reranker = get_reranker()
+        if reranker is None:
+            return await self.search(query, collection, regulations, top_k)
+
+        # Fetch more candidates than requested so the re-ranker has a pool
+        # to choose from (never fewer than 20).
+        candidates = await self.search(
+            query, collection, regulations, top_k=max(top_k * 4, 20)
+        )
+        if not candidates:
+            return []
+
+        texts = [c.text for c in candidates]
+        try:
+            # rerank() is synchronous and CPU-bound (and loads the model on
+            # its first call); run it in the default executor so the event
+            # loop is not blocked while scoring.
+            # NOTE(review): concurrent requests can now reach rerank() from
+            # several worker threads at once - confirm the model tolerates
+            # that, or serialize inside Reranker.
+            loop = asyncio.get_running_loop()
+            ranked_indices = await loop.run_in_executor(
+                None, reranker.rerank, query, texts, top_k
+            )
+            return [candidates[i] for i in ranked_indices]
+        except Exception as e:
+            logger.warning("Reranking failed, returning unranked: %s", e)
+            return candidates[:top_k]
+
    async def scroll(
        self,
        collection: str,
@@ -0,0 +1,85 @@
+"""
+Cross-Encoder Re-Ranking for RAG Search Results.
+
+Uses BGE Reranker v2 (BAAI/bge-reranker-v2-m3, MIT license) to re-rank
+search results from Qdrant for improved retrieval quality.
+
+Lazy-loads the model on first use. Disabled by default (RERANK_ENABLED=false).
+"""
+
+import logging
+import os
+from typing import Optional
+
+logger = logging.getLogger(__name__)
+
+# Re-ranking is opt-in: only the value "true" (case-insensitive) enables it.
+RERANK_ENABLED = os.getenv("RERANK_ENABLED", "false").lower() == "true"
+# Default model name used by Reranker; override via the RERANK_MODEL env var.
+RERANK_MODEL = os.getenv("RERANK_MODEL", "BAAI/bge-reranker-v2-m3")
+
+
+class Reranker:
+    """Cross-encoder reranker using sentence-transformers.
+
+    The model is not loaded in ``__init__``; it is loaded by the first
+    ``rerank`` call that actually has something to score.
+    """
+
+    def __init__(self, model_name: str = RERANK_MODEL):
+        self._model = None  # Lazy init, populated by _ensure_model()
+        self._model_name = model_name
+
+    def _ensure_model(self) -> None:
+        """Load the model on first use; no-op once it is loaded.
+
+        A failed load leaves ``self._model`` unset, so the next call tries
+        again.
+
+        Raises:
+            ImportError: If sentence-transformers is not installed.
+            Exception: Any other error raised while constructing the
+                CrossEncoder is logged and re-raised unchanged.
+        """
+        if self._model is not None:
+            return
+        try:
+            # Imported here so the dependency is only required when
+            # re-ranking is actually used.
+            from sentence_transformers import CrossEncoder
+
+            logger.info("Loading reranker model: %s", self._model_name)
+            self._model = CrossEncoder(self._model_name)
+            logger.info("Reranker model loaded successfully")
+        except ImportError:
+            logger.error(
+                "sentence-transformers not installed. "
+                "Install with: pip install sentence-transformers"
+            )
+            raise
+        except Exception as e:
+            logger.error("Failed to load reranker model: %s", e)
+            raise
+
+    def rerank(
+        self, query: str, texts: list[str], top_k: int = 5
+    ) -> list[int]:
+        """
+        Return indices of top_k texts sorted by relevance (highest first).
+
+        Args:
+            query: The search query.
+            texts: List of candidate texts to re-rank.
+            top_k: Number of top results to return. Values <= 0 yield an
+                empty list.
+
+        Returns:
+            List of indices into the original texts list, sorted by relevance.
+            At most ``min(top_k, len(texts))`` entries.
+
+        Raises:
+            ImportError: If sentence-transformers is not installed.
+            Exception: Whatever model loading or prediction raises.
+        """
+        # Nothing to rank: return before paying for a model load. The
+        # top_k check also keeps a negative value from slicing off the end
+        # of the ranking instead of limiting it.
+        if not texts or top_k <= 0:
+            return []
+
+        self._ensure_model()
+
+        pairs = [[query, text] for text in texts]
+        scores = self._model.predict(pairs)
+
+        # Sort by score descending, return indices
+        ranked = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)
+        return ranked[:top_k]
+
+
+# Shared module-level instance, created on demand by get_reranker()
+_reranker: Optional[Reranker] = None
+
+
+def get_reranker() -> Optional[Reranker]:
+    """Return the shared Reranker, or None when re-ranking is disabled.
+
+    The instance is constructed on the first call made while
+    RERANK_ENABLED is true and reused by every later call.
+    """
+    global _reranker
+    if RERANK_ENABLED:
+        if _reranker is None:
+            _reranker = Reranker()
+        return _reranker
+    return None