"""Control Deduplication Engine — 4-Stage Matching Pipeline. Prevents duplicate atomic controls during Pass 0b by checking candidates against existing controls before insertion. Stages: 1. Pattern-Gate: pattern_id must match (hard gate) 2. Action-Check: normalized action verb must match (hard gate) 3. Object-Norm: normalized object must match (soft gate with high threshold) 4. Embedding: cosine similarity with tiered thresholds (Qdrant) Verdicts: - NEW: create a new atomic control - LINK: add parent link to existing control (similarity > LINK_THRESHOLD) - REVIEW: queue for human review (REVIEW_THRESHOLD < sim < LINK_THRESHOLD) """ import logging import os import re from dataclasses import dataclass, field from typing import Optional, Callable, Awaitable import httpx logger = logging.getLogger(__name__) # ── Configuration ──────────────────────────────────────────────────── DEDUP_ENABLED = os.getenv("DEDUP_ENABLED", "true").lower() == "true" LINK_THRESHOLD = float(os.getenv("DEDUP_LINK_THRESHOLD", "0.92")) REVIEW_THRESHOLD = float(os.getenv("DEDUP_REVIEW_THRESHOLD", "0.85")) LINK_THRESHOLD_DIFF_OBJECT = float(os.getenv("DEDUP_LINK_THRESHOLD_DIFF_OBJ", "0.95")) CROSS_REG_LINK_THRESHOLD = float(os.getenv("DEDUP_CROSS_REG_THRESHOLD", "0.95")) QDRANT_COLLECTION = os.getenv("DEDUP_QDRANT_COLLECTION", "atomic_controls") QDRANT_URL = os.getenv("QDRANT_URL", "http://host.docker.internal:6333") EMBEDDING_URL = os.getenv("EMBEDDING_URL", "http://embedding-service:8087") # ── Result Dataclass ───────────────────────────────────────────────── @dataclass class DedupResult: """Outcome of the dedup check.""" verdict: str # "new" | "link" | "review" matched_control_uuid: Optional[str] = None matched_control_id: Optional[str] = None matched_title: Optional[str] = None stage: str = "" # which stage decided similarity_score: float = 0.0 link_type: str = "dedup_merge" # "dedup_merge" | "cross_regulation" details: dict = field(default_factory=dict) # ── Action Normalization 
───────────────────────────────────────────── _ACTION_SYNONYMS: dict[str, str] = { # German → canonical English "implementieren": "implement", "umsetzen": "implement", "einrichten": "implement", "einführen": "implement", "aufbauen": "implement", "bereitstellen": "implement", "aktivieren": "implement", "konfigurieren": "configure", "einstellen": "configure", "parametrieren": "configure", "testen": "test", "prüfen": "test", "überprüfen": "test", "verifizieren": "test", "validieren": "test", "kontrollieren": "test", "auditieren": "audit", "dokumentieren": "document", "protokollieren": "log", "aufzeichnen": "log", "loggen": "log", "überwachen": "monitor", "monitoring": "monitor", "beobachten": "monitor", "schulen": "train", "trainieren": "train", "sensibilisieren": "train", "löschen": "delete", "entfernen": "delete", "verschlüsseln": "encrypt", "sperren": "block", "beschränken": "restrict", "einschränken": "restrict", "begrenzen": "restrict", "autorisieren": "authorize", "genehmigen": "authorize", "freigeben": "authorize", "authentifizieren": "authenticate", "identifizieren": "identify", "melden": "report", "benachrichtigen": "notify", "informieren": "notify", "aktualisieren": "update", "erneuern": "update", "sichern": "backup", "wiederherstellen": "restore", # English passthrough "implement": "implement", "configure": "configure", "test": "test", "verify": "test", "validate": "test", "audit": "audit", "document": "document", "log": "log", "monitor": "monitor", "train": "train", "delete": "delete", "encrypt": "encrypt", "restrict": "restrict", "authorize": "authorize", "authenticate": "authenticate", "report": "report", "update": "update", "backup": "backup", "restore": "restore", } def normalize_action(action: str) -> str: """Normalize an action verb to a canonical English form.""" if not action: return "" action = action.strip().lower() # Strip German infinitive/conjugation suffixes for lookup action_base = re.sub(r"(en|t|st|e|te|tet|end)$", "", action) # Try exact 
match first, then base form if action in _ACTION_SYNONYMS: return _ACTION_SYNONYMS[action] if action_base in _ACTION_SYNONYMS: return _ACTION_SYNONYMS[action_base] # Fuzzy: check if action starts with any known verb for verb, canonical in _ACTION_SYNONYMS.items(): if action.startswith(verb) or verb.startswith(action): return canonical return action # fallback: return as-is # ── Object Normalization ───────────────────────────────────────────── _OBJECT_SYNONYMS: dict[str, str] = { # Authentication / Access "mfa": "multi_factor_auth", "multi-faktor-authentifizierung": "multi_factor_auth", "mehrfaktorauthentifizierung": "multi_factor_auth", "multi-factor authentication": "multi_factor_auth", "two-factor": "multi_factor_auth", "2fa": "multi_factor_auth", "passwort": "password_policy", "kennwort": "password_policy", "password": "password_policy", "zugangsdaten": "credentials", "credentials": "credentials", "admin-konten": "privileged_access", "admin accounts": "privileged_access", "administratorkonten": "privileged_access", "privilegierte zugriffe": "privileged_access", "privileged accounts": "privileged_access", "remote-zugriff": "remote_access", "fernzugriff": "remote_access", "remote access": "remote_access", "session": "session_management", "sitzung": "session_management", "sitzungsverwaltung": "session_management", # Encryption "verschlüsselung": "encryption", "encryption": "encryption", "kryptografie": "encryption", "kryptografische verfahren": "encryption", "schlüssel": "key_management", "key management": "key_management", "schlüsselverwaltung": "key_management", "zertifikat": "certificate_management", "certificate": "certificate_management", "tls": "transport_encryption", "ssl": "transport_encryption", "https": "transport_encryption", # Network "firewall": "firewall", "netzwerk": "network_security", "network": "network_security", "vpn": "vpn", "segmentierung": "network_segmentation", "segmentation": "network_segmentation", # Logging / Monitoring "audit-log": 
"audit_logging", "audit log": "audit_logging", "protokoll": "audit_logging", "logging": "audit_logging", "monitoring": "monitoring", "überwachung": "monitoring", "alerting": "alerting", "alarmierung": "alerting", "siem": "siem", # Data "personenbezogene daten": "personal_data", "personal data": "personal_data", "sensible daten": "sensitive_data", "sensitive data": "sensitive_data", "datensicherung": "backup", "backup": "backup", "wiederherstellung": "disaster_recovery", "disaster recovery": "disaster_recovery", # Policy / Process "richtlinie": "policy", "policy": "policy", "verfahrensanweisung": "procedure", "procedure": "procedure", "prozess": "process", "schulung": "training", "training": "training", "awareness": "awareness", "sensibilisierung": "awareness", # Incident "vorfall": "incident", "incident": "incident", "sicherheitsvorfall": "security_incident", "security incident": "security_incident", # Vulnerability "schwachstelle": "vulnerability", "vulnerability": "vulnerability", "patch": "patch_management", "update": "patch_management", "patching": "patch_management", } # Precompile for substring matching (longest first) _OBJECT_KEYS_SORTED = sorted(_OBJECT_SYNONYMS.keys(), key=len, reverse=True) def normalize_object(obj: str) -> str: """Normalize a compliance object to a canonical token.""" if not obj: return "" obj_lower = obj.strip().lower() # Exact match if obj_lower in _OBJECT_SYNONYMS: return _OBJECT_SYNONYMS[obj_lower] # Substring match (longest first) for phrase in _OBJECT_KEYS_SORTED: if phrase in obj_lower: return _OBJECT_SYNONYMS[phrase] # Fallback: strip articles/prepositions, join with underscore cleaned = re.sub(r"\b(der|die|das|den|dem|des|ein|eine|eines|einem|einen" r"|für|von|zu|auf|in|an|bei|mit|nach|über|unter|the|a|an" r"|for|of|to|on|in|at|by|with)\b", "", obj_lower) tokens = [t for t in cleaned.split() if len(t) > 2] return "_".join(tokens[:4]) if tokens else obj_lower.replace(" ", "_") # ── Canonicalization 
# ── Canonicalization ─────────────────────────────────────────────────

def canonicalize_text(action: str, obj: str, title: str = "") -> str:
    """Build a canonical English text for embedding.

    Transforms German compliance text into normalized English tokens
    for more stable embedding comparisons.

    Args:
        action: Raw action verb; passed through ``normalize_action``.
        obj: Raw object phrase; passed through ``normalize_object``.
        title: Optional control title. Up to five of its tokens longer than
            three characters are appended after the word ``"for"``, once
            common German filler words have been removed.

    Returns:
        The space-joined text ``"<action> <object>[ for <title tokens>]"``.
    """
    norm_action = normalize_action(action)
    norm_object = normalize_object(obj)

    # Build canonical sentence
    parts = [norm_action, norm_object]
    if title:
        # Add title keywords (stripped of common filler)
        title_clean = re.sub(
            r"\b(und|oder|für|von|zu|der|die|das|den|dem|des|ein|eine"
            r"|bei|mit|nach|gemäß|gem\.|laut|entsprechend)\b",
            "",
            title.lower(),
        )
        title_tokens = [t for t in title_clean.split() if len(t) > 3][:5]
        if title_tokens:
            parts.append("for")
            parts.extend(title_tokens)

    return " ".join(parts)


# ── Embedding Helper ─────────────────────────────────────────────────

async def get_embedding(text: str) -> list[float]:
    """Get embedding vector for a single text via embedding service.

    POSTs ``{"texts": [text]}`` to ``{EMBEDDING_URL}/embed`` and returns the
    first entry of the ``embeddings`` array in the JSON response.

    Returns:
        The embedding vector, or ``[]`` when the request fails, the service
        answers with an error status, or the response holds no embeddings.
        Failures are logged as warnings and never raised.
    """
    try:
        async with httpx.AsyncClient(timeout=10.0) as client:
            resp = await client.post(
                f"{EMBEDDING_URL}/embed",
                json={"texts": [text]},
            )
            # Surface HTTP errors in the log instead of parsing an error body
            # as if it were a result.
            resp.raise_for_status()
            embeddings = resp.json().get("embeddings", [])
            return embeddings[0] if embeddings else []
    except Exception as e:  # best effort: callers treat [] as "unavailable"
        logger.warning("Embedding failed: %s", e)
        return []


def cosine_similarity(a: list[float], b: list[float]) -> float:
    """Compute cosine similarity between two vectors.

    Returns:
        ``dot(a, b) / (|a| * |b|)``, or ``0.0`` when either vector is empty,
        the lengths differ, or either vector has zero norm.
    """
    if not a or not b or len(a) != len(b):
        return 0.0
    dot = sum(x * y for x, y in zip(a, b))
    norm_a = sum(x * x for x in a) ** 0.5
    norm_b = sum(x * x for x in b) ** 0.5
    if norm_a == 0 or norm_b == 0:
        return 0.0
    return dot / (norm_a * norm_b)


# ── Qdrant Helpers ───────────────────────────────────────────────────

async def qdrant_search(
    embedding: list[float],
    pattern_id: str,
    top_k: int = 10,
    collection: Optional[str] = None,
) -> list[dict]:
    """Search Qdrant for similar atomic controls, filtered by pattern_id.

    Args:
        embedding: Query vector; an empty vector short-circuits to ``[]``.
        pattern_id: Only points whose payload ``pattern_id`` equals this
            value are considered.
        top_k: Maximum number of hits requested.
        collection: Collection name; defaults to ``QDRANT_COLLECTION``.

    Returns:
        The ``result`` list of the Qdrant response, or ``[]`` on a non-200
        status or any exception (logged as a warning).
    """
    if not embedding:
        return []

    coll = collection or QDRANT_COLLECTION
    body: dict = {
        "vector": embedding,
        "limit": top_k,
        "with_payload": True,
        "filter": {
            "must": [
                {"key": "pattern_id", "match": {"value": pattern_id}}
            ]
        },
    }

    try:
        async with httpx.AsyncClient(timeout=10.0) as client:
            resp = await client.post(
                f"{QDRANT_URL}/collections/{coll}/points/search",
                json=body,
            )
            if resp.status_code != 200:
                logger.warning("Qdrant search failed: %d", resp.status_code)
                return []
            return resp.json().get("result", [])
    except Exception as e:
        logger.warning("Qdrant search error: %s", e)
        return []


async def qdrant_search_cross_regulation(
    embedding: list[float],
    top_k: int = 5,
    collection: Optional[str] = None,
    exclude_pattern_id: Optional[str] = None,
) -> list[dict]:
    """Search Qdrant for similar controls across ALL regulations.

    No ``pattern_id`` must-filter is applied. Used for cross-regulation
    linking (e.g. DSGVO Art. 25 ↔ NIS2 Art. 21).

    Args:
        embedding: Query vector; an empty vector short-circuits to ``[]``.
        top_k: Maximum number of hits requested.
        collection: Collection name; defaults to ``QDRANT_COLLECTION``.
        exclude_pattern_id: When given, points whose payload ``pattern_id``
            equals this value are excluded via a ``must_not`` filter, so the
            hits do not repeat what the intra-pattern search already covered.

    Returns:
        The ``result`` list of the Qdrant response, or ``[]`` on a non-200
        status or any exception (logged as a warning).
    """
    if not embedding:
        return []

    coll = collection or QDRANT_COLLECTION
    body: dict = {
        "vector": embedding,
        "limit": top_k,
        "with_payload": True,
    }
    if exclude_pattern_id:
        body["filter"] = {
            "must_not": [
                {"key": "pattern_id", "match": {"value": exclude_pattern_id}}
            ]
        }

    try:
        async with httpx.AsyncClient(timeout=10.0) as client:
            resp = await client.post(
                f"{QDRANT_URL}/collections/{coll}/points/search",
                json=body,
            )
            if resp.status_code != 200:
                logger.warning("Qdrant cross-reg search failed: %d", resp.status_code)
                return []
            return resp.json().get("result", [])
    except Exception as e:
        logger.warning("Qdrant cross-reg search error: %s", e)
        return []


async def qdrant_upsert(
    point_id: str,
    embedding: list[float],
    payload: dict,
    collection: Optional[str] = None,
) -> bool:
    """Upsert a single point into a Qdrant collection.

    Returns:
        ``True`` when Qdrant answers with status 200; ``False`` for an empty
        embedding, any other status, or any exception (logged as a warning).
    """
    if not embedding:
        return False

    coll = collection or QDRANT_COLLECTION
    body = {
        "points": [{
            "id": point_id,
            "vector": embedding,
            "payload": payload,
        }]
    }

    try:
        async with httpx.AsyncClient(timeout=10.0) as client:
            resp = await client.put(
                f"{QDRANT_URL}/collections/{coll}/points",
                json=body,
            )
            return resp.status_code == 200
    except Exception as e:
        logger.warning("Qdrant upsert error: %s", e)
        return False


async def ensure_qdrant_collection(
    vector_size: int = 1024,
    collection: Optional[str] = None,
) -> bool:
    """Create a Qdrant collection if it doesn't exist (idempotent).

    A collection that already exists (GET returns 200) is left untouched.
    Otherwise it is created with cosine distance and keyword payload indexes
    on ``pattern_id``, ``action_normalized``, ``object_normalized`` and
    ``control_id``.

    Returns:
        ``True`` when the collection exists or was created; ``False`` when
        creation fails or any exception occurs.
    """
    coll = collection or QDRANT_COLLECTION
    try:
        async with httpx.AsyncClient(timeout=10.0) as client:
            # Check if exists
            resp = await client.get(f"{QDRANT_URL}/collections/{coll}")
            if resp.status_code == 200:
                return True

            # Create
            resp = await client.put(
                f"{QDRANT_URL}/collections/{coll}",
                json={
                    "vectors": {"size": vector_size, "distance": "Cosine"},
                },
            )
            if resp.status_code == 200:
                logger.info("Created Qdrant collection: %s", coll)
                # Create payload indexes (their responses are not checked)
                for field_name in ["pattern_id", "action_normalized", "object_normalized", "control_id"]:
                    await client.put(
                        f"{QDRANT_URL}/collections/{coll}/index",
                        json={"field_name": field_name, "field_schema": "keyword"},
                    )
                return True
            logger.error("Failed to create Qdrant collection: %d", resp.status_code)
            return False
    except Exception as e:
        logger.warning("Qdrant collection check error: %s", e)
        return False


# ── Main Dedup Checker ───────────────────────────────────────────────

class ControlDedupChecker:
    """4-stage dedup checker for atomic controls.

    Usage:
        checker = ControlDedupChecker(db_session)
        result = await checker.check_duplicate(candidate_action, candidate_object, candidate_title, pattern_id)
        if result.verdict == "link":
            checker.add_parent_link(result.matched_control_uuid, parent_uuid)
        elif result.verdict == "review":
            checker.write_review(candidate, result)
        else:
            # Insert new control
    """

    def __init__(
        self,
        db,
        embed_fn: Optional[Callable[[str], Awaitable[list[float]]]] = None,
        search_fn: Optional[Callable] = None,
        cross_search_fn: Optional[Callable] = None,
    ):
        """Store the DB handle and the (injectable) embedding/search callables.

        Args:
            db: Database handle; the methods call ``execute`` and ``commit``
                on it with SQLAlchemy ``text`` statements.
            embed_fn: Async ``text -> vector``; defaults to ``get_embedding``.
            search_fn: Async intra-pattern search; defaults to ``qdrant_search``.
            cross_search_fn: Async unfiltered search, called as
                ``fn(embedding, top_k=..., exclude_pattern_id=...)``; defaults
                to ``qdrant_search_cross_regulation``.
        """
        self.db = db
        self._embed = embed_fn or get_embedding
        self._search = search_fn or qdrant_search
        self._cross_search = cross_search_fn or qdrant_search_cross_regulation
        self._cache: dict[str, list[dict]] = {}  # pattern_id → existing controls

    def _load_existing(self, pattern_id: str) -> list[dict]:
        """Load existing atomic controls with same pattern_id from DB.

        Results are memoized per ``pattern_id`` in ``self._cache``;
        ``index_control`` drops the entry of the pattern it indexes.
        """
        if pattern_id in self._cache:
            return self._cache[pattern_id]

        from sqlalchemy import text
        rows = self.db.execute(text("""
            SELECT id::text, control_id, title, objective,
                   pattern_id,
                   generation_metadata->>'obligation_type' as obligation_type
            FROM canonical_controls
            WHERE parent_control_uuid IS NOT NULL
              AND release_state != 'deprecated'
              AND pattern_id = :pid
        """), {"pid": pattern_id}).fetchall()

        result = [
            {
                "uuid": r[0], "control_id": r[1], "title": r[2],
                "objective": r[3], "pattern_id": r[4],
                "obligation_type": r[5],
            }
            for r in rows
        ]
        self._cache[pattern_id] = result
        return result

    async def check_duplicate(
        self,
        action: str,
        obj: str,
        title: str,
        pattern_id: Optional[str],
    ) -> "DedupResult":
        """Run the 4-stage dedup pipeline + cross-regulation linking.

        Only the top-scoring Qdrant hit is evaluated. Every intra-pattern
        "new" outcome reached after a successful embedding is passed through
        ``_check_cross_regulation`` before being returned.

        Returns DedupResult with verdict: new/link/review.
        """
        # No pattern_id → can't dedup meaningfully
        if not pattern_id:
            return DedupResult(verdict="new", stage="no_pattern")

        # Stage 1: Pattern-Gate
        existing = self._load_existing(pattern_id)
        if not existing:
            return DedupResult(
                verdict="new", stage="pattern_gate",
                details={"reason": "no existing controls with this pattern_id"},
            )

        # Stage 2: Action-Check
        # The DB rows carry no stored action, so the comparison happens below
        # against the `action_normalized` payload of the best Qdrant hit.
        norm_action = normalize_action(action)

        # Stage 3: Object-Normalization
        norm_object = normalize_object(obj)

        # Stage 4: Embedding Similarity
        canonical = canonicalize_text(action, obj, title)
        embedding = await self._embed(canonical)

        if not embedding:
            # Can't compute embedding → default to new
            return DedupResult(
                verdict="new", stage="embedding_unavailable",
                details={"canonical_text": canonical},
            )

        # Search Qdrant
        results = await self._search(embedding, pattern_id, top_k=5)

        if not results:
            # No intra-pattern matches → try cross-regulation
            return await self._check_cross_regulation(embedding, DedupResult(
                verdict="new", stage="no_qdrant_matches",
                details={"canonical_text": canonical, "action": norm_action, "object": norm_object},
            ), pattern_id)

        # Evaluate best match
        best = results[0]
        best_score = best.get("score", 0.0)
        best_payload = best.get("payload") or {}  # tolerate an explicit null payload
        best_action = best_payload.get("action_normalized", "")
        best_object = best_payload.get("object_normalized", "")

        # Action differs → NEW (even if embedding is high)
        if best_action and norm_action and best_action != norm_action:
            return await self._check_cross_regulation(embedding, DedupResult(
                verdict="new", stage="action_mismatch",
                similarity_score=best_score,
                matched_control_id=best_payload.get("control_id"),
                details={
                    "candidate_action": norm_action,
                    "existing_action": best_action,
                    "similarity": best_score,
                },
            ), pattern_id)

        # Object differs → use higher threshold
        if best_object and norm_object and best_object != norm_object:
            if best_score > LINK_THRESHOLD_DIFF_OBJECT:
                return DedupResult(
                    verdict="link", stage="embedding_diff_object",
                    matched_control_uuid=best_payload.get("control_uuid"),
                    matched_control_id=best_payload.get("control_id"),
                    matched_title=best_payload.get("title"),
                    similarity_score=best_score,
                    details={"candidate_object": norm_object, "existing_object": best_object},
                )
            return await self._check_cross_regulation(embedding, DedupResult(
                verdict="new", stage="object_mismatch_below_threshold",
                similarity_score=best_score,
                matched_control_id=best_payload.get("control_id"),
                details={
                    "candidate_object": norm_object,
                    "existing_object": best_object,
                    "threshold": LINK_THRESHOLD_DIFF_OBJECT,
                },
            ), pattern_id)

        # Same action + same object → tiered thresholds
        if best_score > LINK_THRESHOLD:
            return DedupResult(
                verdict="link", stage="embedding_match",
                matched_control_uuid=best_payload.get("control_uuid"),
                matched_control_id=best_payload.get("control_id"),
                matched_title=best_payload.get("title"),
                similarity_score=best_score,
            )
        if best_score > REVIEW_THRESHOLD:
            return DedupResult(
                verdict="review", stage="embedding_review",
                matched_control_uuid=best_payload.get("control_uuid"),
                matched_control_id=best_payload.get("control_id"),
                matched_title=best_payload.get("title"),
                similarity_score=best_score,
            )

        return await self._check_cross_regulation(embedding, DedupResult(
            verdict="new", stage="embedding_below_threshold",
            similarity_score=best_score,
            details={"threshold": REVIEW_THRESHOLD},
        ), pattern_id)

    async def _check_cross_regulation(
        self,
        embedding: list[float],
        intra_result: "DedupResult",
        pattern_id: Optional[str] = None,
    ) -> "DedupResult":
        """Second pass: cross-regulation linking for controls deemed 'new'.

        Searches Qdrant WITHOUT a pattern_id must-filter and links when the
        best remaining hit scores above ``CROSS_REG_LINK_THRESHOLD``.

        Args:
            embedding: Candidate embedding; when empty nothing is searched.
            intra_result: Outcome of the intra-pattern pass; returned
                unchanged unless its verdict is "new" and a link is found.
            pattern_id: The candidate's own pattern. Hits carrying the same
                pattern are ignored: the intra-pattern pass already judged
                them, and re-accepting one here would override its
                action/object gates.

        Returns:
            A "link" result with ``link_type="cross_regulation"``, or
            ``intra_result``.
        """
        if intra_result.verdict != "new" or not embedding:
            return intra_result

        cross_results = await self._cross_search(
            embedding, top_k=5, exclude_pattern_id=pattern_id,
        )
        if pattern_id:
            # Client-side guard in case the search callable does not honour
            # the exclusion.
            cross_results = [
                hit for hit in cross_results
                if (hit.get("payload") or {}).get("pattern_id") != pattern_id
            ]
        if not cross_results:
            return intra_result

        best = cross_results[0]
        best_score = best.get("score", 0.0)
        if best_score > CROSS_REG_LINK_THRESHOLD:
            best_payload = best.get("payload") or {}
            return DedupResult(
                verdict="link",
                stage="cross_regulation",
                matched_control_uuid=best_payload.get("control_uuid"),
                matched_control_id=best_payload.get("control_id"),
                matched_title=best_payload.get("title"),
                similarity_score=best_score,
                link_type="cross_regulation",
                details={
                    "cross_reg_score": best_score,
                    "cross_reg_threshold": CROSS_REG_LINK_THRESHOLD,
                },
            )

        return intra_result

    def add_parent_link(
        self,
        control_uuid: str,
        parent_control_uuid: str,
        link_type: str = "dedup_merge",
        confidence: float = 0.0,
        source_regulation: Optional[str] = None,
        source_article: Optional[str] = None,
        obligation_candidate_id: Optional[str] = None,
    ) -> None:
        """Add a parent link to an existing atomic control.

        Inserts into ``control_parent_links`` and commits. An existing
        ``(control_uuid, parent_control_uuid)`` pair is left untouched
        (``ON CONFLICT ... DO NOTHING``).
        """
        from sqlalchemy import text
        # CAST(... AS uuid) rather than the `::uuid` shorthand: a colon right
        # after a bind-parameter name can be misread by text()'s parameter
        # parser. NOTE(review): equivalent in PostgreSQL — confirm on the
        # deployed SQLAlchemy version.
        self.db.execute(text("""
            INSERT INTO control_parent_links
                (control_uuid, parent_control_uuid, link_type, confidence,
                 source_regulation, source_article, obligation_candidate_id)
            VALUES (:cu, :pu, :lt, :conf, :sr, :sa, CAST(:oci AS uuid))
            ON CONFLICT (control_uuid, parent_control_uuid) DO NOTHING
        """), {
            "cu": control_uuid,
            "pu": parent_control_uuid,
            "lt": link_type,
            "conf": confidence,
            "sr": source_regulation,
            "sa": source_article,
            "oci": obligation_candidate_id,
        })
        self.db.commit()

    def write_review(
        self,
        candidate_control_id: str,
        candidate_title: str,
        candidate_objective: str,
        result: "DedupResult",
        parent_control_uuid: Optional[str] = None,
        obligation_candidate_id: Optional[str] = None,
    ) -> None:
        """Write a dedup review queue entry.

        Inserts the candidate together with the matched control, similarity
        score, deciding stage and JSON-encoded details into
        ``control_dedup_reviews`` and commits.
        """
        import json

        from sqlalchemy import text
        # CAST(...) instead of `::type` for the same reason as in
        # add_parent_link: keep bind-parameter names free of a trailing colon.
        self.db.execute(text("""
            INSERT INTO control_dedup_reviews
                (candidate_control_id, candidate_title, candidate_objective,
                 matched_control_uuid, matched_control_id,
                 similarity_score, dedup_stage, dedup_details,
                 parent_control_uuid, obligation_candidate_id)
            VALUES (:ccid, :ct, :co, CAST(:mcu AS uuid), :mci, :ss, :ds,
                    CAST(:dd AS jsonb), CAST(:pcu AS uuid), :oci)
        """), {
            "ccid": candidate_control_id,
            "ct": candidate_title,
            "co": candidate_objective,
            "mcu": result.matched_control_uuid,
            "mci": result.matched_control_id,
            "ss": result.similarity_score,
            "ds": result.stage,
            "dd": json.dumps(result.details),
            "pcu": parent_control_uuid,
            "oci": obligation_candidate_id,
        })
        self.db.commit()

    async def index_control(
        self,
        control_uuid: str,
        control_id: str,
        title: str,
        action: str,
        obj: str,
        pattern_id: str,
        collection: Optional[str] = None,
    ) -> bool:
        """Index a new atomic control in Qdrant for future dedup checks.

        Also drops the cached DB lookup for ``pattern_id``; otherwise an
        earlier "no existing controls" answer would keep short-circuiting
        ``check_duplicate`` at the pattern gate for the rest of this
        checker's lifetime.

        Returns:
            ``True`` when the point was upserted; ``False`` when no embedding
            could be computed or the upsert failed.
        """
        self._cache.pop(pattern_id, None)

        norm_action = normalize_action(action)
        norm_object = normalize_object(obj)
        canonical = canonicalize_text(action, obj, title)
        embedding = await self._embed(canonical)

        if not embedding:
            return False

        return await qdrant_upsert(
            point_id=control_uuid,
            embedding=embedding,
            payload={
                "control_uuid": control_uuid,
                "control_id": control_id,
                "title": title,
                "pattern_id": pattern_id,
                "action_normalized": norm_action,
                "object_normalized": norm_object,
                "canonical_text": canonical,
            },
            collection=collection,
        )