feat(control-pipeline): replace similarity-only dedup with LLM-verified dedup in pipeline

Stage 4 (Harmonization) now uses a two-tier approach:
- Score >= 0.92: auto-duplicate (embedding only, fast)
- Score 0.85-0.92: LLM verification via local qwen3.5 (think=false, ~3s)
- Score < 0.85: not a duplicate

This eliminates ~44% false positives from pure embedding similarity.
LLM_DEDUP_ENABLED env var controls the feature (default: true).

Also adds 10 applicability use case tests (bank+TAN, webshop+Stripe,
SaaS startup, energy provider, health app, automotive, law firm, etc.)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-04-23 16:57:37 +02:00
parent bed41dcbdf
commit 1f8667c7da
2 changed files with 558 additions and 16 deletions

View File

# ── Ollama / harmonization configuration ──────────────────────────────
# All values are env-overridable; defaults target a local Ollama instance.
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
OLLAMA_MODEL = os.getenv("CONTROL_GEN_OLLAMA_MODEL", "qwen3.5:35b-a3b")
LLM_TIMEOUT = float(os.getenv("CONTROL_GEN_LLM_TIMEOUT", "180"))
# Cosine similarity >= this makes an existing control a *candidate* for dedup.
HARMONIZATION_THRESHOLD = 0.85
# Cosine similarity >= this is treated as a duplicate outright (no LLM check).
HARMONIZATION_AUTO_DUP = 0.92
# Feature flag for the borderline LLM-verification tier (default: enabled).
LLM_DEDUP_ENABLED = os.getenv("LLM_DEDUP_ENABLED", "true").lower() == "true"
# Pipeline version — increment when generation rules change materially.
# v1: Original (local LLM prefilter, old prompt)
@@ -1589,10 +1591,13 @@ Gib ein JSON-Array zurueck mit GENAU {len(chunks)} Elementen. Fuer Aspekte ohne
# ── Stage 4: Harmonization ─────────────────────────────────────────
async def _check_harmonization(self, new_control: GeneratedControl) -> Optional[list]:
"""Check if a new control duplicates existing ones via Qdrant vector search.
"""Check if a new control duplicates existing ones.
Uses the atomic_controls_dedup collection for fast nearest-neighbor lookup
instead of pre-loading all embeddings into memory.
Two-tier approach:
1. Fast: Qdrant embedding similarity (pre-filter)
2. Precise: Local LLM verification for borderline matches (0.85-0.92)
Returns list of similar controls if duplicate, None otherwise.
"""
new_text = f"{new_control.title} {new_control.objective}"
new_emb = await _get_embedding(new_text)
@@ -1610,22 +1615,90 @@ Gib ein JSON-Array zurueck mit GENAU {len(chunks)} Elementen. Fuer Aspekte ohne
"with_payload": {"include": ["control_id", "title"]},
},
)
if resp.status_code == 200:
results = resp.json().get("result", [])
if results:
return [
{
"control_id": r["payload"].get("control_id", ""),
"title": r["payload"].get("title", ""),
"similarity": round(r["score"], 3),
}
for r in results
]
if resp.status_code != 200:
return None
results = resp.json().get("result", [])
if not results:
return None
best = results[0]
best_score = best.get("score", 0.0)
best_id = best["payload"].get("control_id", "")
best_title = best["payload"].get("title", "")
# Tier 1: High similarity → auto-duplicate
if best_score >= HARMONIZATION_AUTO_DUP:
return [{"control_id": best_id, "title": best_title,
"similarity": round(best_score, 3), "method": "embedding_auto"}]
# Tier 2: Borderline → LLM verification
if LLM_DEDUP_ENABLED and best_score >= HARMONIZATION_THRESHOLD:
is_dup = await self._llm_verify_duplicate(
new_control.title, new_control.objective or "",
best_title, "",
)
if is_dup:
return [{"control_id": best_id, "title": best_title,
"similarity": round(best_score, 3), "method": "llm_verified"}]
# LLM says different → not a duplicate
return None
# Below threshold but returned by Qdrant → not a duplicate
return None
except Exception as e:
logger.warning("Qdrant dedup search failed: %s — skipping harmonization", e)
logger.warning("Harmonization check failed: %s — skipping", e)
return None
async def _llm_verify_duplicate(
    self, title_a: str, obj_a: str, title_b: str, obj_b: str,
) -> bool:
    """Ask the local LLM whether two controls are duplicates.

    Returns True iff the model's JSON verdict contains "DUPLIKAT".
    Uses qwen3.5 with think=false for fast (~3s) responses; any
    transport or parse failure is logged and treated as "not a
    duplicate" so a flaky LLM never blocks the pipeline.
    """
    # Objectives are truncated to 300 chars to keep the prompt short.
    comparison = (
        f"Control A:\n{title_a}\n{obj_a[:300]}\n\n"
        f"Control B:\n{title_b}\n{obj_b[:300]}\n\n"
        f"Sind diese Controls Duplikate?"
    )
    request_body = {
        "model": OLLAMA_MODEL,
        "stream": False,
        "think": False,  # no chain-of-thought — speed over reasoning depth
        "options": {"num_predict": 200},
        "messages": [
            {"role": "system", "content": (
                "Du bist ein Compliance-Experte. Vergleiche zwei Controls: "
                "DUPLIKAT (gleiche Anforderung, nur anders formuliert) oder "
                "VERSCHIEDEN (unterschiedlicher Scope/Inhalt). "
                "Antworte NUR mit JSON: {\"verdict\": \"DUPLIKAT\" oder \"VERSCHIEDEN\", "
                "\"reason\": \"kurze Begruendung\"}"
            )},
            {"role": "user", "content": comparison},
        ],
    }
    try:
        async with httpx.AsyncClient(timeout=30.0) as client:
            resp = await client.post(f"{OLLAMA_URL}/api/chat", json=request_body)
            if resp.status_code == 200:
                answer = resp.json().get("message", {}).get("content", "")
                verdict = _parse_llm_json(answer)
                if verdict and "DUPLIKAT" in str(verdict.get("verdict", "")).upper():
                    return True
    except Exception as e:  # best-effort: log and fall through to "not a duplicate"
        logger.warning("LLM dedup verification failed: %s", e)
    return False
async def _preload_embeddings(self, existing: list[dict]):
"""Pre-load embeddings for all existing controls in batches."""
texts = [f"{ex.get('title', '')} {ex.get('objective', '')}" for ex in existing]