"""Batch Dedup Runner — Orchestrates deduplication of ~85k atomare Controls. Reduces Pass 0b controls from ~85k to ~18-25k unique Master Controls via: Phase 1: Intra-Group Dedup — same merge_group_hint → pick best, link rest (85k → ~52k, mostly title-identical short-circuit, no embeddings) Phase 2: Cross-Group Dedup — embed masters, search Qdrant for similar masters with different hints (52k → ~18-25k) All Pass 0b controls have pattern_id=NULL. The primary grouping key is merge_group_hint (format: "action_type:norm_obj:trigger_key"), which encodes the normalized action, object, and trigger. Usage: runner = BatchDedupRunner(db) stats = await runner.run(dry_run=True) # preview stats = await runner.run(dry_run=False) # execute stats = await runner.run(hint_filter="implement:multi_factor_auth:none") """ import json import logging import time from collections import defaultdict from sqlalchemy import text from services.control_dedup import ( canonicalize_text, ensure_qdrant_collection, get_embedding, normalize_action, normalize_object, qdrant_search_cross_regulation, qdrant_upsert, LINK_THRESHOLD, REVIEW_THRESHOLD, ) logger = logging.getLogger(__name__) DEDUP_COLLECTION = "atomic_controls_dedup" # ── Quality Score ──────────────────────────────────────────────────────── def quality_score(control: dict) -> float: """Score a control by richness of requirements, tests, evidence, and objective. Higher score = better candidate for master control. """ score = 0.0 reqs = control.get("requirements") or "[]" if isinstance(reqs, str): try: reqs = json.loads(reqs) except (json.JSONDecodeError, TypeError): reqs = [] score += len(reqs) * 2.0 tests = control.get("test_procedure") or "[]" if isinstance(tests, str): try: tests = json.loads(tests) except (json.JSONDecodeError, TypeError): tests = [] score += len(tests) * 1.5 evidence = control.get("evidence") or "[]" if isinstance(evidence, str): try: evidence = json.loads(evidence) except (json.JSONDecodeError, TypeError): evidence = [] score += len(evidence) * 1.0 objective = control.get("objective") or "" score += min(len(objective) / 200, 3.0) return score # ── Batch Dedup Runner ─────────────────────────────────────────────────── class BatchDedupRunner: """Batch dedup orchestrator for existing Pass 0b atomic controls.""" def __init__(self, db, collection: str = DEDUP_COLLECTION): self.db = db self.collection = collection self.stats = { "total_controls": 0, "unique_hints": 0, "phase1_groups_processed": 0, "masters": 0, "linked": 0, "review": 0, "new_controls": 0, "parent_links_transferred": 0, "cross_group_linked": 0, "cross_group_review": 0, "errors": 0, "skipped_title_identical": 0, } self._progress_phase = "" self._progress_count = 0 self._progress_total = 0 async def run( self, dry_run: bool = False, hint_filter: str = None, ) -> dict: """Run the full batch dedup pipeline. Args: dry_run: If True, compute stats but don't modify DB/Qdrant. hint_filter: If set, only process groups matching this hint prefix. Returns: Stats dict with counts. """ start = time.monotonic() logger.info("BatchDedup starting (dry_run=%s, hint_filter=%s)", dry_run, hint_filter) if not dry_run: await ensure_qdrant_collection(collection=self.collection) # Phase 1: Intra-group dedup (same merge_group_hint) self._progress_phase = "phase1" groups = self._load_merge_groups(hint_filter) self._progress_total = self.stats["total_controls"] for hint, controls in groups: try: await self._process_hint_group(hint, controls, dry_run) self.stats["phase1_groups_processed"] += 1 except Exception as e: logger.error("BatchDedup Phase 1 error on hint %s: %s", hint, e) self.stats["errors"] += 1 try: self.db.rollback() except Exception: pass logger.info( "BatchDedup Phase 1 done: %d masters, %d linked, %d review", self.stats["masters"], self.stats["linked"], self.stats["review"], ) # Phase 2: Cross-group dedup via embeddings if not dry_run: self._progress_phase = "phase2" await self._run_cross_group_pass() elapsed = time.monotonic() - start self.stats["elapsed_seconds"] = round(elapsed, 1) logger.info("BatchDedup completed in %.1fs: %s", elapsed, self.stats) return self.stats def _load_merge_groups(self, hint_filter: str = None) -> list: """Load all Pass 0b controls grouped by merge_group_hint, largest first.""" conditions = [ "decomposition_method = 'pass0b'", "release_state != 'deprecated'", "release_state != 'duplicate'", ] params = {} if hint_filter: conditions.append("generation_metadata->>'merge_group_hint' LIKE :hf") params["hf"] = f"{hint_filter}%" where = " AND ".join(conditions) rows = self.db.execute(text(f""" SELECT id::text, control_id, title, objective, pattern_id, requirements::text, test_procedure::text, evidence::text, release_state, generation_metadata->>'merge_group_hint' as merge_group_hint, generation_metadata->>'action_object_class' as action_object_class FROM canonical_controls WHERE {where} ORDER BY control_id """), params).fetchall() by_hint = defaultdict(list) for r in rows: by_hint[r[9] or ""].append({ "uuid": r[0], "control_id": r[1], "title": r[2], "objective": r[3], "pattern_id": r[4], "requirements": r[5], "test_procedure": r[6], "evidence": r[7], "release_state": r[8], "merge_group_hint": r[9] or "", "action_object_class": r[10] or "", }) self.stats["total_controls"] = len(rows) self.stats["unique_hints"] = len(by_hint) sorted_groups = sorted(by_hint.items(), key=lambda x: len(x[1]), reverse=True) logger.info("BatchDedup loaded %d controls in %d hint groups", len(rows), len(sorted_groups)) return sorted_groups def _sub_group_by_merge_hint(self, controls: list) -> dict: """Group controls by merge_group_hint composite key.""" groups = defaultdict(list) for c in controls: hint = c["merge_group_hint"] if hint: groups[hint].append(c) else: groups[f"__no_hint_{c['uuid']}"].append(c) return dict(groups) async def _process_hint_group( self, hint: str, controls: list, dry_run: bool, ): """Process all controls sharing the same merge_group_hint. Within a hint group, all controls share action+object+trigger. The best-quality control becomes master, rest are linked as duplicates. """ if len(controls) < 2: # Singleton → always master self.stats["masters"] += 1 if not dry_run: await self._embed_and_index(controls[0]) self._progress_count += 1 self._log_progress(hint) return # Sort by quality score (best first) sorted_group = sorted(controls, key=quality_score, reverse=True) master = sorted_group[0] self.stats["masters"] += 1 if not dry_run: await self._embed_and_index(master) for candidate in sorted_group[1:]: # All share the same hint → check title similarity if candidate["title"].strip().lower() == master["title"].strip().lower(): # Identical title → direct link (no embedding needed) self.stats["linked"] += 1 self.stats["skipped_title_identical"] += 1 if not dry_run: await self._mark_duplicate(master, candidate, confidence=1.0) else: # Different title within same hint → still likely duplicate # Use embedding to verify await self._check_and_link_within_group(master, candidate, dry_run) self._progress_count += 1 self._log_progress(hint) async def _check_and_link_within_group( self, master: dict, candidate: dict, dry_run: bool, ): """Check if candidate (same hint group) is duplicate of master via embedding.""" parts = candidate["merge_group_hint"].split(":", 2) action = parts[0] if len(parts) > 0 else "" obj = parts[1] if len(parts) > 1 else "" canonical = canonicalize_text(action, obj, candidate["title"]) embedding = await get_embedding(canonical) if not embedding: # Can't embed → link anyway (same hint = same action+object) self.stats["linked"] += 1 if not dry_run: await self._mark_duplicate(master, candidate, confidence=0.90) return # Search the dedup collection (unfiltered — pattern_id is NULL) results = await qdrant_search_cross_regulation( embedding, top_k=3, collection=self.collection, ) if not results: # No Qdrant matches yet (master might not be indexed yet) → link to master self.stats["linked"] += 1 if not dry_run: await self._mark_duplicate(master, candidate, confidence=0.90) return best = results[0] best_score = best.get("score", 0.0) best_payload = best.get("payload", {}) best_uuid = best_payload.get("control_uuid", "") if best_score > LINK_THRESHOLD: self.stats["linked"] += 1 if not dry_run: await self._mark_duplicate_to(best_uuid, candidate, confidence=best_score) elif best_score > REVIEW_THRESHOLD: self.stats["review"] += 1 if not dry_run: self._write_review(candidate, best_payload, best_score) else: # Very different despite same hint → new master self.stats["new_controls"] += 1 if not dry_run: await self._index_with_embedding(candidate, embedding) async def _run_cross_group_pass(self): """Phase 2: Find cross-group duplicates among surviving masters. After Phase 1, ~52k masters remain. Many have similar semantics despite different merge_group_hints (e.g. different German spellings). This pass embeds all masters and finds near-duplicates via Qdrant. """ logger.info("BatchDedup Phase 2: Cross-group pass starting...") rows = self.db.execute(text(""" SELECT id::text, control_id, title, generation_metadata->>'merge_group_hint' as merge_group_hint FROM canonical_controls WHERE decomposition_method = 'pass0b' AND release_state != 'duplicate' AND release_state != 'deprecated' ORDER BY control_id """)).fetchall() self._progress_total = len(rows) self._progress_count = 0 logger.info("BatchDedup Cross-group: %d masters to check", len(rows)) cross_linked = 0 cross_review = 0 for i, r in enumerate(rows): uuid = r[0] hint = r[3] or "" parts = hint.split(":", 2) action = parts[0] if len(parts) > 0 else "" obj = parts[1] if len(parts) > 1 else "" canonical = canonicalize_text(action, obj, r[2]) embedding = await get_embedding(canonical) if not embedding: continue results = await qdrant_search_cross_regulation( embedding, top_k=5, collection=self.collection, ) if not results: continue # Find best match from a DIFFERENT hint group for match in results: match_score = match.get("score", 0.0) match_payload = match.get("payload", {}) match_uuid = match_payload.get("control_uuid", "") # Skip self-match if match_uuid == uuid: continue # Must be a different hint group (otherwise already handled in Phase 1) match_action = match_payload.get("action_normalized", "") match_object = match_payload.get("object_normalized", "") # Simple check: different control UUID is enough if match_score > LINK_THRESHOLD: # Mark the worse one as duplicate try: self.db.execute(text(""" UPDATE canonical_controls SET release_state = 'duplicate', merged_into_uuid = CAST(:master AS uuid) WHERE id = CAST(:dup AS uuid) AND release_state != 'duplicate' """), {"master": match_uuid, "dup": uuid}) self.db.execute(text(""" INSERT INTO control_parent_links (control_uuid, parent_control_uuid, link_type, confidence) VALUES (CAST(:cu AS uuid), CAST(:pu AS uuid), 'cross_regulation', :conf) ON CONFLICT (control_uuid, parent_control_uuid) DO NOTHING """), {"cu": match_uuid, "pu": uuid, "conf": match_score}) # Transfer parent links transferred = self._transfer_parent_links(match_uuid, uuid) self.stats["parent_links_transferred"] += transferred self.db.commit() cross_linked += 1 except Exception as e: logger.error("BatchDedup cross-group link error %s→%s: %s", uuid, match_uuid, e) self.db.rollback() self.stats["errors"] += 1 break # Only one cross-link per control elif match_score > REVIEW_THRESHOLD: self._write_review( {"control_id": r[1], "title": r[2], "objective": "", "merge_group_hint": hint, "pattern_id": None}, match_payload, match_score, ) cross_review += 1 break self._progress_count = i + 1 if (i + 1) % 500 == 0: logger.info("BatchDedup Cross-group: %d/%d checked, %d linked, %d review", i + 1, len(rows), cross_linked, cross_review) self.stats["cross_group_linked"] = cross_linked self.stats["cross_group_review"] = cross_review logger.info("BatchDedup Cross-group complete: %d linked, %d review", cross_linked, cross_review) # ── Qdrant Helpers ─────────────────────────────────────────────────── async def _embed_and_index(self, control: dict): """Compute embedding and index a control in the dedup Qdrant collection.""" parts = control["merge_group_hint"].split(":", 2) action = parts[0] if len(parts) > 0 else "" obj = parts[1] if len(parts) > 1 else "" norm_action = normalize_action(action) norm_object = normalize_object(obj) canonical = canonicalize_text(action, obj, control["title"]) embedding = await get_embedding(canonical) if not embedding: return await qdrant_upsert( point_id=control["uuid"], embedding=embedding, payload={ "control_uuid": control["uuid"], "control_id": control["control_id"], "title": control["title"], "pattern_id": control.get("pattern_id"), "action_normalized": norm_action, "object_normalized": norm_object, "canonical_text": canonical, "merge_group_hint": control["merge_group_hint"], }, collection=self.collection, ) async def _index_with_embedding(self, control: dict, embedding: list): """Index a control with a pre-computed embedding.""" parts = control["merge_group_hint"].split(":", 2) action = parts[0] if len(parts) > 0 else "" obj = parts[1] if len(parts) > 1 else "" norm_action = normalize_action(action) norm_object = normalize_object(obj) canonical = canonicalize_text(action, obj, control["title"]) await qdrant_upsert( point_id=control["uuid"], embedding=embedding, payload={ "control_uuid": control["uuid"], "control_id": control["control_id"], "title": control["title"], "pattern_id": control.get("pattern_id"), "action_normalized": norm_action, "object_normalized": norm_object, "canonical_text": canonical, "merge_group_hint": control["merge_group_hint"], }, collection=self.collection, ) # ── DB Write Helpers ───────────────────────────────────────────────── async def _mark_duplicate(self, master: dict, candidate: dict, confidence: float): """Mark candidate as duplicate of master, transfer parent links.""" try: self.db.execute(text(""" UPDATE canonical_controls SET release_state = 'duplicate', merged_into_uuid = CAST(:master AS uuid) WHERE id = CAST(:cand AS uuid) """), {"master": master["uuid"], "cand": candidate["uuid"]}) self.db.execute(text(""" INSERT INTO control_parent_links (control_uuid, parent_control_uuid, link_type, confidence) VALUES (CAST(:master AS uuid), CAST(:cand_parent AS uuid), 'dedup_merge', :conf) ON CONFLICT (control_uuid, parent_control_uuid) DO NOTHING """), {"master": master["uuid"], "cand_parent": candidate["uuid"], "conf": confidence}) transferred = self._transfer_parent_links(master["uuid"], candidate["uuid"]) self.stats["parent_links_transferred"] += transferred self.db.commit() except Exception as e: logger.error("BatchDedup _mark_duplicate error %s→%s: %s", candidate["uuid"], master["uuid"], e) self.db.rollback() raise async def _mark_duplicate_to(self, master_uuid: str, candidate: dict, confidence: float): """Mark candidate as duplicate of a Qdrant-matched master.""" try: self.db.execute(text(""" UPDATE canonical_controls SET release_state = 'duplicate', merged_into_uuid = CAST(:master AS uuid) WHERE id = CAST(:cand AS uuid) """), {"master": master_uuid, "cand": candidate["uuid"]}) self.db.execute(text(""" INSERT INTO control_parent_links (control_uuid, parent_control_uuid, link_type, confidence) VALUES (CAST(:master AS uuid), CAST(:cand_parent AS uuid), 'dedup_merge', :conf) ON CONFLICT (control_uuid, parent_control_uuid) DO NOTHING """), {"master": master_uuid, "cand_parent": candidate["uuid"], "conf": confidence}) transferred = self._transfer_parent_links(master_uuid, candidate["uuid"]) self.stats["parent_links_transferred"] += transferred self.db.commit() except Exception as e: logger.error("BatchDedup _mark_duplicate_to error %s→%s: %s", candidate["uuid"], master_uuid, e) self.db.rollback() raise def _transfer_parent_links(self, master_uuid: str, duplicate_uuid: str) -> int: """Move existing parent links from duplicate to master.""" rows = self.db.execute(text(""" SELECT parent_control_uuid::text, link_type, confidence, source_regulation, source_article, obligation_candidate_id::text FROM control_parent_links WHERE control_uuid = CAST(:dup AS uuid) AND link_type = 'decomposition' """), {"dup": duplicate_uuid}).fetchall() transferred = 0 for r in rows: parent_uuid = r[0] if parent_uuid == master_uuid: continue self.db.execute(text(""" INSERT INTO control_parent_links (control_uuid, parent_control_uuid, link_type, confidence, source_regulation, source_article, obligation_candidate_id) VALUES (CAST(:cu AS uuid), CAST(:pu AS uuid), :lt, :conf, :sr, :sa, CAST(:oci AS uuid)) ON CONFLICT (control_uuid, parent_control_uuid) DO NOTHING """), { "cu": master_uuid, "pu": parent_uuid, "lt": r[1], "conf": float(r[2]) if r[2] else 1.0, "sr": r[3], "sa": r[4], "oci": r[5], }) transferred += 1 return transferred def _write_review(self, candidate: dict, matched_payload: dict, score: float): """Write a dedup review entry for borderline matches.""" try: self.db.execute(text(""" INSERT INTO control_dedup_reviews (candidate_control_id, candidate_title, candidate_objective, matched_control_uuid, matched_control_id, similarity_score, dedup_stage, dedup_details) VALUES (:ccid, :ct, :co, CAST(:mcu AS uuid), :mci, :ss, 'batch_dedup', CAST(:dd AS jsonb)) """), { "ccid": candidate["control_id"], "ct": candidate["title"], "co": candidate.get("objective", ""), "mcu": matched_payload.get("control_uuid"), "mci": matched_payload.get("control_id"), "ss": score, "dd": json.dumps({ "merge_group_hint": candidate.get("merge_group_hint", ""), "pattern_id": candidate.get("pattern_id"), }), }) self.db.commit() except Exception as e: logger.error("BatchDedup _write_review error: %s", e) self.db.rollback() raise # ── Progress ───────────────────────────────────────────────────────── def _log_progress(self, hint: str): """Log progress every 500 controls.""" if self._progress_count > 0 and self._progress_count % 500 == 0: logger.info( "BatchDedup [%s] %d/%d — masters=%d, linked=%d, review=%d", self._progress_phase, self._progress_count, self._progress_total, self.stats["masters"], self.stats["linked"], self.stats["review"], ) def get_status(self) -> dict: """Return current progress stats (for status endpoint).""" return { "phase": self._progress_phase, "progress": self._progress_count, "total": self._progress_total, **self.stats, }