feat(control-pipeline): replace similarity-only dedup with LLM-verified dedup in pipeline

Stage 4 (Harmonization) now uses a two-tier approach:
- Score >= 0.92: auto-duplicate (embedding only, fast)
- Score 0.85-0.92: LLM verification via local qwen3.5 (think=false, ~3s)
- Score < 0.85: not a duplicate

This eliminates ~44% false positives from pure embedding similarity.
LLM_DEDUP_ENABLED env var controls the feature (default: true).

Also adds 10 applicability use case tests (bank+TAN, webshop+Stripe,
SaaS startup, energy provider, health app, automotive, law firm, etc.)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-04-23 16:57:37 +02:00
parent bed41dcbdf
commit 1f8667c7da
2 changed files with 558 additions and 16 deletions

View File

# ── Ollama / harmonization configuration ──────────────────────────────
# All values are env-overridable; defaults target a local Ollama instance.
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
OLLAMA_MODEL = os.getenv("CONTROL_GEN_OLLAMA_MODEL", "qwen3.5:35b-a3b")
LLM_TIMEOUT = float(os.getenv("CONTROL_GEN_LLM_TIMEOUT", "180"))
# Cosine similarity >= this makes an existing control a *candidate* for dedup.
HARMONIZATION_THRESHOLD = 0.85
# Cosine similarity >= this is treated as a duplicate outright (no LLM check).
HARMONIZATION_AUTO_DUP = 0.92
# Feature flag for the borderline LLM-verification tier (default: enabled).
LLM_DEDUP_ENABLED = os.getenv("LLM_DEDUP_ENABLED", "true").lower() == "true"
# Pipeline version — increment when generation rules change materially.
# v1: Original (local LLM prefilter, old prompt)
@@ -1589,10 +1591,13 @@ Gib ein JSON-Array zurueck mit GENAU {len(chunks)} Elementen. Fuer Aspekte ohne
# ── Stage 4: Harmonization ─────────────────────────────────────────
async def _check_harmonization(self, new_control: GeneratedControl) -> Optional[list]:
"""Check if a new control duplicates existing ones via Qdrant vector search.
"""Check if a new control duplicates existing ones.
Uses the atomic_controls_dedup collection for fast nearest-neighbor lookup
instead of pre-loading all embeddings into memory.
Two-tier approach:
1. Fast: Qdrant embedding similarity (pre-filter)
2. Precise: Local LLM verification for borderline matches (0.85-0.92)
Returns list of similar controls if duplicate, None otherwise.
"""
new_text = f"{new_control.title} {new_control.objective}"
new_emb = await _get_embedding(new_text)
@@ -1610,22 +1615,90 @@ Gib ein JSON-Array zurueck mit GENAU {len(chunks)} Elementen. Fuer Aspekte ohne
"with_payload": {"include": ["control_id", "title"]},
},
)
if resp.status_code == 200:
results = resp.json().get("result", [])
if results:
return [
{
"control_id": r["payload"].get("control_id", ""),
"title": r["payload"].get("title", ""),
"similarity": round(r["score"], 3),
}
for r in results
]
if resp.status_code != 200:
return None
results = resp.json().get("result", [])
if not results:
return None
best = results[0]
best_score = best.get("score", 0.0)
best_id = best["payload"].get("control_id", "")
best_title = best["payload"].get("title", "")
# Tier 1: High similarity → auto-duplicate
if best_score >= HARMONIZATION_AUTO_DUP:
return [{"control_id": best_id, "title": best_title,
"similarity": round(best_score, 3), "method": "embedding_auto"}]
# Tier 2: Borderline → LLM verification
if LLM_DEDUP_ENABLED and best_score >= HARMONIZATION_THRESHOLD:
is_dup = await self._llm_verify_duplicate(
new_control.title, new_control.objective or "",
best_title, "",
)
if is_dup:
return [{"control_id": best_id, "title": best_title,
"similarity": round(best_score, 3), "method": "llm_verified"}]
# LLM says different → not a duplicate
return None
# Below threshold but returned by Qdrant → not a duplicate
return None
except Exception as e:
logger.warning("Qdrant dedup search failed: %s — skipping harmonization", e)
logger.warning("Harmonization check failed: %s — skipping", e)
return None
async def _llm_verify_duplicate(
    self, title_a: str, obj_a: str, title_b: str, obj_b: str,
) -> bool:
    """Ask the local LLM whether two controls are duplicates.

    Returns True iff the model's JSON verdict contains "DUPLIKAT".
    Uses qwen3.5 with think=false for fast (~3s) responses; any
    transport or parse failure is logged and treated as "not a
    duplicate" so a flaky LLM never blocks the pipeline.
    """
    # Objectives are truncated to 300 chars to keep the prompt short.
    comparison = (
        f"Control A:\n{title_a}\n{obj_a[:300]}\n\n"
        f"Control B:\n{title_b}\n{obj_b[:300]}\n\n"
        f"Sind diese Controls Duplikate?"
    )
    request_body = {
        "model": OLLAMA_MODEL,
        "stream": False,
        "think": False,  # no chain-of-thought — speed over reasoning depth
        "options": {"num_predict": 200},
        "messages": [
            {"role": "system", "content": (
                "Du bist ein Compliance-Experte. Vergleiche zwei Controls: "
                "DUPLIKAT (gleiche Anforderung, nur anders formuliert) oder "
                "VERSCHIEDEN (unterschiedlicher Scope/Inhalt). "
                "Antworte NUR mit JSON: {\"verdict\": \"DUPLIKAT\" oder \"VERSCHIEDEN\", "
                "\"reason\": \"kurze Begruendung\"}"
            )},
            {"role": "user", "content": comparison},
        ],
    }
    try:
        async with httpx.AsyncClient(timeout=30.0) as client:
            resp = await client.post(f"{OLLAMA_URL}/api/chat", json=request_body)
            if resp.status_code == 200:
                answer = resp.json().get("message", {}).get("content", "")
                verdict = _parse_llm_json(answer)
                if verdict and "DUPLIKAT" in str(verdict.get("verdict", "")).upper():
                    return True
    except Exception as e:  # best-effort: log and fall through to "not a duplicate"
        logger.warning("LLM dedup verification failed: %s", e)
    return False
async def _preload_embeddings(self, existing: list[dict]):
"""Pre-load embeddings for all existing controls in batches."""
texts = [f"{ex.get('title', '')} {ex.get('objective', '')}" for ex in existing]