# breakpilot-compliance/backend-compliance/compliance/services/batch_dedup_runner.py

"""Batch Dedup Runner — Orchestrates deduplication of ~85k atomic controls.

Reduces Pass 0b controls from ~85k to ~18-25k unique Master Controls via:
  Phase 1: Intra-Group Dedup — same merge_group_hint → pick best, link rest
           (85k → ~52k, mostly title-identical short-circuit, no embeddings)
  Phase 2: Cross-Group Dedup — embed masters, search Qdrant for similar
           masters with different hints (52k → ~18-25k)

All Pass 0b controls have pattern_id=NULL. The primary grouping key is
merge_group_hint (format: "action_type:norm_obj:trigger_key"), which
encodes the normalized action, object, and trigger.

Usage:
    runner = BatchDedupRunner(db)
    stats = await runner.run(dry_run=True)       # preview
    stats = await runner.run(dry_run=False)       # execute
    stats = await runner.run(hint_filter="implement:multi_factor_auth:none")
"""

import json
import logging
import time
from collections import defaultdict

from sqlalchemy import text

from compliance.services.control_dedup import (
    canonicalize_text,
    ensure_qdrant_collection,
    get_embedding,
    normalize_action,
    normalize_object,
    qdrant_search_cross_regulation,
    qdrant_upsert,
    LINK_THRESHOLD,
    REVIEW_THRESHOLD,
)

logger = logging.getLogger(__name__)

DEDUP_COLLECTION = "atomic_controls_dedup"


# ── Quality Score ────────────────────────────────────────────────────────


# (field name, weight per entry) used by quality_score().
_QUALITY_WEIGHTS = (
    ("requirements", 2.0),
    ("test_procedure", 1.5),
    ("evidence", 1.0),
)


def _count_json_items(value) -> int:
    """Return the number of entries in a JSON-encoded or already parsed collection.

    Strings are decoded with ``json.loads``; text that cannot be decoded counts
    as empty. Only lists, tuples and dicts contribute a length. Everything else
    (``None``, JSON ``null``, numbers, booleans, bare strings) counts as zero
    instead of raising ``TypeError`` from ``len()``.
    """
    if isinstance(value, str):
        try:
            value = json.loads(value)
        except (json.JSONDecodeError, TypeError):
            return 0
    if isinstance(value, (list, tuple, dict)):
        return len(value)
    return 0


def quality_score(control: dict) -> float:
    """Score a control by richness of requirements, tests, evidence, and objective.

    Higher score = better candidate for master control.

    Each requirement adds 2.0, each test procedure entry 1.5 and each evidence
    entry 1.0. The objective contributes its length / 200, capped at 3.0.

    Args:
        control: Control dict; ``requirements``, ``test_procedure`` and
            ``evidence`` may be JSON strings or parsed collections, and any
            field may be missing or ``None``.

    Returns:
        The accumulated score as a float (0.0 for an empty control).
    """
    score = sum(
        (_count_json_items(control.get(field)) * weight
         for field, weight in _QUALITY_WEIGHTS),
        0.0,
    )

    objective = control.get("objective") or ""
    score += min(len(objective) / 200, 3.0)

    return score


# ── Batch Dedup Runner ───────────────────────────────────────────────────


class BatchDedupRunner:
    """Batch dedup orchestrator for existing Pass 0b atomic controls."""

    def __init__(self, db, collection: str = DEDUP_COLLECTION):
        """Store the DB handle and target collection, and zero all counters.

        Args:
            db: Database handle used for ``execute``/``commit`` calls.
            collection: Name of the Qdrant collection used for dedup lookups.
        """
        # Counter names in the order they appear in the reported stats dict.
        counter_names = (
            "total_controls",
            "unique_hints",
            "phase1_groups_processed",
            "masters",
            "linked",
            "review",
            "new_controls",
            "parent_links_transferred",
            "cross_group_linked",
            "cross_group_review",
            "errors",
            "skipped_title_identical",
        )

        self.db = db
        self.collection = collection
        self.stats = dict.fromkeys(counter_names, 0)

        # Progress tracking, exposed through get_status().
        self._progress_phase = ""
        self._progress_count = 0
        self._progress_total = 0

    async def run(
        self,
        dry_run: bool = False,
        hint_filter: str = None,
    ) -> dict:
        """Run the full batch dedup pipeline.

        Phase 1 processes every merge-hint group. A failure in one group is
        logged (with traceback), counted in ``stats["errors"]`` and the DB
        session is rolled back so the remaining groups can still be processed.
        Phase 2 (cross-group pass) only runs when ``dry_run`` is False.

        Args:
            dry_run: If True, compute stats but don't modify DB/Qdrant.
            hint_filter: If set, only process groups matching this hint prefix.

        Returns:
            Stats dict with counts, plus ``elapsed_seconds``.
        """
        start = time.monotonic()
        logger.info("BatchDedup starting (dry_run=%s, hint_filter=%s)",
                     dry_run, hint_filter)

        if not dry_run:
            await ensure_qdrant_collection(collection=self.collection)

        # Phase 1: Intra-group dedup (same merge_group_hint)
        self._progress_phase = "phase1"
        groups = self._load_merge_groups(hint_filter)
        self._progress_total = self.stats["total_controls"]

        for hint, controls in groups:
            try:
                await self._process_hint_group(hint, controls, dry_run)
            except Exception as e:
                logger.error("BatchDedup Phase 1 error on hint %s: %s", hint, e,
                             exc_info=True)
                self.stats["errors"] += 1
                # A failed statement leaves the session's transaction unusable
                # until it is rolled back; without this every following group
                # would fail as well. Completed merges were already committed
                # by the write helpers, so only the failed unit is discarded.
                try:
                    self.db.rollback()
                except Exception as rollback_error:
                    logger.error(
                        "BatchDedup rollback failed after error on hint %s: %s",
                        hint, rollback_error,
                    )
            else:
                self.stats["phase1_groups_processed"] += 1

        logger.info(
            "BatchDedup Phase 1 done: %d masters, %d linked, %d review",
            self.stats["masters"], self.stats["linked"], self.stats["review"],
        )

        # Phase 2: Cross-group dedup via embeddings
        if not dry_run:
            self._progress_phase = "phase2"
            await self._run_cross_group_pass()

        elapsed = time.monotonic() - start
        self.stats["elapsed_seconds"] = round(elapsed, 1)
        logger.info("BatchDedup completed in %.1fs: %s", elapsed, self.stats)
        return self.stats

    def _load_merge_groups(self, hint_filter: str = None) -> list:
        """Load all Pass 0b controls grouped by merge_group_hint, largest first.

        Controls without a merge_group_hint are NOT pooled into one group:
        they share no action/object/trigger, so each one becomes its own
        singleton group (via ``_sub_group_by_merge_hint``) and is never
        deduplicated against other hint-less controls in Phase 1.

        Args:
            hint_filter: Optional literal prefix; only hints starting with it
                are loaded. LIKE wildcards in the prefix are escaped.

        Returns:
            List of ``(group_key, controls)`` tuples sorted by group size,
            largest first. Also updates ``stats["total_controls"]`` and
            ``stats["unique_hints"]`` (number of distinct non-empty hints).
        """
        conditions = [
            "decomposition_method = 'pass0b'",
            "release_state != 'deprecated'",
            "release_state != 'duplicate'",
        ]
        params = {}

        if hint_filter:
            # Hints routinely contain "_" (e.g. "multi_factor_auth"), which LIKE
            # treats as a single-character wildcard. Escape the metacharacters
            # so the filter is a literal prefix; backslash is PostgreSQL's
            # default LIKE escape character.
            escaped = (
                hint_filter.replace("\\", "\\\\")
                .replace("%", "\\%")
                .replace("_", "\\_")
            )
            conditions.append("generation_metadata->>'merge_group_hint' LIKE :hf")
            params["hf"] = f"{escaped}%"

        where = " AND ".join(conditions)
        rows = self.db.execute(text(f"""
            SELECT id::text, control_id, title, objective,
                   pattern_id, requirements::text, test_procedure::text,
                   evidence::text, release_state,
                   generation_metadata->>'merge_group_hint' as merge_group_hint,
                   generation_metadata->>'action_object_class' as action_object_class
            FROM canonical_controls
            WHERE {where}
            ORDER BY control_id
        """), params).fetchall()

        # Column order of the SELECT above.
        columns = (
            "uuid", "control_id", "title", "objective", "pattern_id",
            "requirements", "test_procedure", "evidence", "release_state",
            "merge_group_hint", "action_object_class",
        )
        controls = []
        for row in rows:
            control = dict(zip(columns, row))
            control["merge_group_hint"] = control["merge_group_hint"] or ""
            control["action_object_class"] = control["action_object_class"] or ""
            controls.append(control)

        by_hint = self._sub_group_by_merge_hint(controls)

        self.stats["total_controls"] = len(controls)
        self.stats["unique_hints"] = len(
            {c["merge_group_hint"] for c in controls if c["merge_group_hint"]}
        )

        sorted_groups = sorted(by_hint.items(), key=lambda x: len(x[1]), reverse=True)
        logger.info("BatchDedup loaded %d controls in %d hint groups",
                     len(controls), len(sorted_groups))
        return sorted_groups

    def _sub_group_by_merge_hint(self, controls: list) -> dict:
        """Group controls by merge_group_hint composite key."""
        groups = defaultdict(list)
        for c in controls:
            hint = c["merge_group_hint"]
            if hint:
                groups[hint].append(c)
            else:
                groups[f"__no_hint_{c['uuid']}"].append(c)
        return dict(groups)

    async def _process_hint_group(
        self,
        hint: str,
        controls: list,
        dry_run: bool,
    ):
        """Process all controls sharing the same merge_group_hint.

        Within a hint group, all controls share action+object+trigger.
        The best-quality control becomes master, rest are linked as duplicates.
        """
        if len(controls) < 2:
            # Singleton → always master
            self.stats["masters"] += 1
            if not dry_run:
                await self._embed_and_index(controls[0])
            self._progress_count += 1
            self._log_progress(hint)
            return

        # Sort by quality score (best first)
        sorted_group = sorted(controls, key=quality_score, reverse=True)
        master = sorted_group[0]
        self.stats["masters"] += 1

        if not dry_run:
            await self._embed_and_index(master)

        for candidate in sorted_group[1:]:
            # All share the same hint → check title similarity
            if candidate["title"].strip().lower() == master["title"].strip().lower():
                # Identical title → direct link (no embedding needed)
                self.stats["linked"] += 1
                self.stats["skipped_title_identical"] += 1
                if not dry_run:
                    await self._mark_duplicate(master, candidate, confidence=1.0)
            else:
                # Different title within same hint → still likely duplicate
                # Use embedding to verify
                await self._check_and_link_within_group(master, candidate, dry_run)

            self._progress_count += 1
            self._log_progress(hint)

    async def _check_and_link_within_group(
        self,
        master: dict,
        candidate: dict,
        dry_run: bool,
    ):
        """Check if candidate (same hint group) is duplicate of master via embedding.

        The candidate is embedded and compared against the dedup collection.
        Hits that point at the candidate itself, or that carry no
        ``control_uuid``, are ignored so a control can never be merged into
        itself or into an empty UUID.

        Outcome by best remaining score:
          * no embedding / no usable hit → linked to ``master`` (confidence 0.90)
          * score > LINK_THRESHOLD       → linked to the matched control
          * score > REVIEW_THRESHOLD     → review entry written
          * otherwise                    → candidate indexed as a new master

        Args:
            master: Master control of the hint group.
            candidate: Control to classify.
            dry_run: If True, only counters are updated; nothing is written.
        """
        # Hint format is "action:object:trigger"; missing parts become "".
        action, obj = (candidate["merge_group_hint"].split(":", 2) + [""])[:2]

        canonical = canonicalize_text(action, obj, candidate["title"])
        embedding = await get_embedding(canonical)

        if not embedding:
            # Can't embed → link anyway (same hint = same action+object)
            self.stats["linked"] += 1
            if not dry_run:
                await self._mark_duplicate(master, candidate, confidence=0.90)
            return

        # Search the dedup collection (unfiltered — pattern_id is NULL)
        results = await qdrant_search_cross_regulation(
            embedding, top_k=3, collection=self.collection,
        )

        # Drop self-matches (e.g. a point left over from an earlier run) and
        # hits without a usable UUID.
        candidate_uuid = candidate["uuid"]
        usable = []
        for hit in results or []:
            hit_uuid = (hit.get("payload") or {}).get("control_uuid")
            if hit_uuid and hit_uuid != candidate_uuid:
                usable.append(hit)

        if not usable:
            # No Qdrant matches yet (master might not be indexed yet) → link to master
            self.stats["linked"] += 1
            if not dry_run:
                await self._mark_duplicate(master, candidate, confidence=0.90)
            return

        # Like the original code, this relies on the search returning hits
        # best-first.
        best = usable[0]
        best_score = best.get("score", 0.0)
        best_payload = best.get("payload") or {}
        best_uuid = best_payload.get("control_uuid", "")

        if best_score > LINK_THRESHOLD:
            self.stats["linked"] += 1
            if not dry_run:
                await self._mark_duplicate_to(best_uuid, candidate, confidence=best_score)
        elif best_score > REVIEW_THRESHOLD:
            self.stats["review"] += 1
            if not dry_run:
                self._write_review(candidate, best_payload, best_score)
        else:
            # Very different despite same hint → new master
            self.stats["new_controls"] += 1
            if not dry_run:
                await self._index_with_embedding(candidate, embedding)

    async def _run_cross_group_pass(self):
        """Phase 2: Find cross-group duplicates among surviving masters.

        After Phase 1, ~52k masters remain. Many have similar semantics
        despite different merge_group_hints (e.g. different German spellings).
        This pass embeds all masters and finds near-duplicates via Qdrant.

        For every surviving control the nearest neighbours are inspected:
          * score > LINK_THRESHOLD   → the current control is marked as a
            duplicate of the match (at most one link per control)
          * score > REVIEW_THRESHOLD → a review entry is written

        A match is only accepted as merge target while it is itself still a
        live control. Without that check two near-identical masters A and B
        would be merged into each other (A→B, then B→A) and no master would
        survive.

        Updates ``stats["cross_group_linked"]`` / ``stats["cross_group_review"]``
        as it goes, so ``get_status()`` reflects live numbers.
        """
        logger.info("BatchDedup Phase 2: Cross-group pass starting...")

        rows = self.db.execute(text("""
            SELECT id::text, control_id, title,
                   generation_metadata->>'merge_group_hint' as merge_group_hint
            FROM canonical_controls
            WHERE decomposition_method = 'pass0b'
              AND release_state != 'duplicate'
              AND release_state != 'deprecated'
            ORDER BY control_id
        """)).fetchall()

        total = len(rows)
        self._progress_total = total
        self._progress_count = 0
        logger.info("BatchDedup Cross-group: %d masters to check", total)
        self.stats["cross_group_linked"] = 0
        self.stats["cross_group_review"] = 0

        # UUIDs turned into duplicates during this pass; they must not be
        # picked as merge targets by later rows.
        merged_uuids = set()

        for checked, (uuid, control_id, title, raw_hint) in enumerate(rows, start=1):
            hint = raw_hint or ""
            # Hint format is "action:object:trigger"; missing parts become "".
            action, obj = (hint.split(":", 2) + [""])[:2]

            canonical = canonicalize_text(action, obj, title)
            embedding = await get_embedding(canonical)

            results = None
            if embedding:
                results = await qdrant_search_cross_regulation(
                    embedding, top_k=5, collection=self.collection,
                )

            for match in results or []:
                match_score = match.get("score", 0.0)
                match_payload = match.get("payload") or {}
                match_uuid = match_payload.get("control_uuid", "")

                # Skip unusable hits: no UUID, self-match, or a control that
                # was already merged away earlier in this pass.
                if not match_uuid or match_uuid == uuid or match_uuid in merged_uuids:
                    continue

                if match_score > LINK_THRESHOLD:
                    if not self._link_cross_group_duplicate(uuid, match_uuid, match_score):
                        # Target is no longer a live control → try the next hit
                        continue
                    merged_uuids.add(uuid)
                    self.stats["cross_group_linked"] += 1
                    break  # Only one cross-link per control

                if match_score > REVIEW_THRESHOLD:
                    self._write_review(
                        {"control_id": control_id, "title": title, "objective": "",
                         "merge_group_hint": hint, "pattern_id": None},
                        match_payload, match_score,
                    )
                    self.stats["cross_group_review"] += 1
                    break

            self._progress_count = checked
            if checked % 500 == 0:
                logger.info("BatchDedup Cross-group: %d/%d checked, %d linked, %d review",
                            checked, total,
                            self.stats["cross_group_linked"],
                            self.stats["cross_group_review"])

        logger.info("BatchDedup Cross-group complete: %d linked, %d review",
                     self.stats["cross_group_linked"],
                     self.stats["cross_group_review"])

    def _link_cross_group_duplicate(
        self, duplicate_uuid: str, master_uuid: str, score: float
    ) -> bool:
        """Mark ``duplicate_uuid`` as a cross-group duplicate of ``master_uuid``.

        The UPDATE only fires while the master is neither 'duplicate' nor
        'deprecated', which also rejects stale Qdrant points of controls that
        were merged in an earlier run.

        Returns:
            True when the link was written and committed, False when the
            UPDATE matched no row (nothing is written in that case).
        """
        result = self.db.execute(text("""
            UPDATE canonical_controls
            SET release_state = 'duplicate', merged_into_uuid = CAST(:master AS uuid)
            WHERE id = CAST(:dup AS uuid)
              AND release_state != 'duplicate'
              AND EXISTS (
                  SELECT 1 FROM canonical_controls m
                  WHERE m.id = CAST(:master AS uuid)
                    AND m.release_state NOT IN ('duplicate', 'deprecated')
              )
        """), {"master": master_uuid, "dup": duplicate_uuid})

        # Only an explicit 0 means "no row changed"; drivers that cannot
        # report a count are treated as success.
        if result.rowcount == 0:
            return False

        self.db.execute(text("""
            INSERT INTO control_parent_links
                (control_uuid, parent_control_uuid, link_type, confidence)
            VALUES (CAST(:cu AS uuid), CAST(:pu AS uuid), 'cross_regulation', :conf)
            ON CONFLICT (control_uuid, parent_control_uuid) DO NOTHING
        """), {"cu": master_uuid, "pu": duplicate_uuid, "conf": score})

        # Transfer parent links
        transferred = self._transfer_parent_links(master_uuid, duplicate_uuid)
        self.stats["parent_links_transferred"] += transferred

        self.db.commit()
        return True

    # ── Qdrant Helpers ───────────────────────────────────────────────────

    async def _embed_and_index(self, control: dict):
        """Compute embedding and index a control in the dedup Qdrant collection.

        The embedding is computed from the canonical text built out of the
        hint's action/object and the control title. When no embedding is
        returned the control is silently left un-indexed. Payload construction
        and the upsert are delegated to ``_index_with_embedding`` so both
        indexing paths always write the same payload shape.
        """
        # Hint format is "action:object:trigger"; missing parts become "".
        action, obj = (control["merge_group_hint"].split(":", 2) + [""])[:2]

        canonical = canonicalize_text(action, obj, control["title"])
        embedding = await get_embedding(canonical)

        if not embedding:
            return

        # NOTE(review): _index_with_embedding rebuilds the canonical text from
        # the same inputs; this presumes canonicalize_text is deterministic.
        await self._index_with_embedding(control, embedding)

    async def _index_with_embedding(self, control: dict, embedding: list):
        """Upsert a control into the dedup collection using a ready-made embedding.

        The point id is the control's UUID; the payload carries identifiers,
        the title, the normalised action/object and the canonical text.
        """
        hint = control["merge_group_hint"]
        hint_fields = hint.split(":", 2)
        action = hint_fields[0]
        obj = hint_fields[1] if len(hint_fields) >= 2 else ""

        action_normalized = normalize_action(action)
        object_normalized = normalize_object(obj)
        canonical_text = canonicalize_text(action, obj, control["title"])

        point_payload = {
            "control_uuid": control["uuid"],
            "control_id": control["control_id"],
            "title": control["title"],
            "pattern_id": control.get("pattern_id"),
            "action_normalized": action_normalized,
            "object_normalized": object_normalized,
            "canonical_text": canonical_text,
            "merge_group_hint": hint,
        }

        await qdrant_upsert(
            point_id=control["uuid"],
            embedding=embedding,
            payload=point_payload,
            collection=self.collection,
        )

    # ── DB Write Helpers ─────────────────────────────────────────────────

    async def _mark_duplicate(self, master: dict, candidate: dict, confidence: float):
        """Mark candidate as duplicate of master, transfer parent links.

        Thin wrapper around ``_mark_duplicate_to`` so there is a single
        implementation of the merge bookkeeping (state update, 'dedup_merge'
        link, parent-link transfer, commit).

        Args:
            master: Master control dict; only its ``uuid`` is used.
            candidate: Control dict being merged away.
            confidence: Confidence stored on the 'dedup_merge' link.
        """
        await self._mark_duplicate_to(master["uuid"], candidate, confidence)

    async def _mark_duplicate_to(self, master_uuid: str, candidate: dict, confidence: float):
        """Mark candidate as duplicate of the control identified by ``master_uuid``.

        Sets the candidate's release_state to 'duplicate' with
        ``merged_into_uuid`` pointing at the master, records a 'dedup_merge'
        link, copies the candidate's parent links to the master and commits.

        An empty ``master_uuid`` or a master equal to the candidate is refused
        with a warning: it would fail the UUID cast or create a control merged
        into itself.

        Args:
            master_uuid: UUID (as text) of the surviving control.
            candidate: Control dict being merged away; its ``uuid`` is used.
            confidence: Confidence stored on the 'dedup_merge' link.
        """
        duplicate_uuid = candidate["uuid"]
        if not master_uuid or master_uuid == duplicate_uuid:
            logger.warning(
                "BatchDedup refusing to link control %s to invalid master %r",
                duplicate_uuid, master_uuid,
            )
            return

        self.db.execute(text("""
            UPDATE canonical_controls
            SET release_state = 'duplicate', merged_into_uuid = CAST(:master AS uuid)
            WHERE id = CAST(:cand AS uuid)
        """), {"master": master_uuid, "cand": duplicate_uuid})

        self.db.execute(text("""
            INSERT INTO control_parent_links
                (control_uuid, parent_control_uuid, link_type, confidence)
            VALUES (CAST(:master AS uuid), CAST(:cand_parent AS uuid), 'dedup_merge', :conf)
            ON CONFLICT (control_uuid, parent_control_uuid) DO NOTHING
        """), {"master": master_uuid, "cand_parent": duplicate_uuid, "conf": confidence})

        transferred = self._transfer_parent_links(master_uuid, duplicate_uuid)
        self.stats["parent_links_transferred"] += transferred

        self.db.commit()

    def _transfer_parent_links(self, master_uuid: str, duplicate_uuid: str) -> int:
        """Copy the duplicate's 'decomposition' parent links onto the master.

        The duplicate's own rows are left in place; each link is re-inserted
        for the master with ``ON CONFLICT DO NOTHING``. A link whose parent is
        the master itself is skipped. Does not commit — the caller does.

        Args:
            master_uuid: UUID (as text) of the surviving control.
            duplicate_uuid: UUID (as text) of the control being merged away.

        Returns:
            Number of insert attempts. Links that already existed on the
            master are still counted.
        """
        rows = self.db.execute(text("""
            SELECT parent_control_uuid::text, link_type, confidence,
                   source_regulation, source_article, obligation_candidate_id::text
            FROM control_parent_links
            WHERE control_uuid = CAST(:dup AS uuid)
              AND link_type = 'decomposition'
        """), {"dup": duplicate_uuid}).fetchall()

        transferred = 0
        for parent_uuid, link_type, confidence, regulation, article, candidate_id in rows:
            if parent_uuid == master_uuid:
                continue
            self.db.execute(text("""
                INSERT INTO control_parent_links
                    (control_uuid, parent_control_uuid, link_type, confidence,
                     source_regulation, source_article, obligation_candidate_id)
                VALUES (CAST(:cu AS uuid), CAST(:pu AS uuid), :lt, :conf,
                        :sr, :sa, CAST(:oci AS uuid))
                ON CONFLICT (control_uuid, parent_control_uuid) DO NOTHING
            """), {
                "cu": master_uuid,
                "pu": parent_uuid,
                "lt": link_type,
                # Only a missing confidence defaults to 1.0; a stored 0.0 is a
                # real value and must survive the transfer.
                "conf": float(confidence) if confidence is not None else 1.0,
                "sr": regulation,
                "sa": article,
                "oci": candidate_id,
            })
            transferred += 1

        return transferred

    def _write_review(self, candidate: dict, matched_payload: dict, score: float):
        """Write a dedup review entry for borderline matches.

        Inserts one row into ``control_dedup_reviews`` with stage
        'batch_dedup' and commits immediately.

        Args:
            candidate: Control dict; ``control_id`` and ``title`` are required,
                ``objective``, ``merge_group_hint`` and ``pattern_id`` optional.
            matched_payload: Qdrant payload of the matched control
                (``control_uuid`` / ``control_id`` are read).
            score: Similarity score of the match.
        """
        # The JSON parameter is cast with CAST(... AS jsonb) rather than the
        # "::jsonb" shorthand: a bind name directly followed by "::" is not
        # parsed as that bind parameter by text(). CAST() is also the form the
        # other statements in this class use.
        self.db.execute(text("""
            INSERT INTO control_dedup_reviews
                (candidate_control_id, candidate_title, candidate_objective,
                 matched_control_uuid, matched_control_id,
                 similarity_score, dedup_stage, dedup_details)
            VALUES (:ccid, :ct, :co, CAST(:mcu AS uuid), :mci,
                    :ss, 'batch_dedup', CAST(:dd AS jsonb))
        """), {
            "ccid": candidate["control_id"],
            "ct": candidate["title"],
            "co": candidate.get("objective", ""),
            "mcu": matched_payload.get("control_uuid"),
            "mci": matched_payload.get("control_id"),
            "ss": score,
            "dd": json.dumps({
                "merge_group_hint": candidate.get("merge_group_hint", ""),
                "pattern_id": candidate.get("pattern_id"),
            }),
        })
        self.db.commit()

    # ── Progress ─────────────────────────────────────────────────────────

    def _log_progress(self, hint: str):
        """Log progress every 500 controls."""
        if self._progress_count > 0 and self._progress_count % 500 == 0:
            logger.info(
                "BatchDedup [%s] %d/%d — masters=%d, linked=%d, review=%d",
                self._progress_phase, self._progress_count, self._progress_total,
                self.stats["masters"], self.stats["linked"], self.stats["review"],
            )

    def get_status(self) -> dict:
        """Return current progress stats (for status endpoint)."""
        return {
            "phase": self._progress_phase,
            "progress": self._progress_count,
            "total": self._progress_total,
            **self.stats,
        }