feat: Batch Dedup Runner — 85k→~18-25k Master Controls

Adds batch orchestration for deduplicating ~85k Pass 0b atomic controls into ~18-25k unique masters with M:N parent linking.

New files:
- migrations/078_batch_dedup.sql: merged_into_uuid column, perf indexes, link_type CHECK extended for cross_regulation
- batch_dedup_runner.py: BatchDedupRunner with quality scoring, merge-hint grouping, title-identical short-circuit, parent-link transfer, and cross-regulation pass
- tests/test_batch_dedup_runner.py: 21 tests (all passing)

Modified:
- control_dedup.py: optional collection param on Qdrant functions
- crosswalk_routes.py: POST/GET batch-dedup endpoints

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-24 07:06:38 +01:00
parent cce2707c03
commit 35784c35eb
5 changed files with 1126 additions and 10 deletions
@@ -0,0 +1,433 @@
+"""Tests for Batch Dedup Runner (batch_dedup_runner.py).
+
+Covers:
+- quality_score(): Richness ranking
+- BatchDedupRunner._sub_group_by_merge_hint(): Composite key grouping
+- Master selection (highest quality score wins)
+- Duplicate linking (mark + parent-link transfer)
+- Title-identical short-circuit (no embedding call)
+- Dry run mode (no DB changes)
+- Cross-regulation pass
+- Progress reporting / stats
+- Batch-dedup status route endpoint
+"""
+
+import json
+import pytest
+from unittest.mock import MagicMock, AsyncMock, patch, call
+
+from compliance.services.batch_dedup_runner import (
+    quality_score,
+    BatchDedupRunner,
+    DEDUP_COLLECTION,
+)
+
+
+# ---------------------------------------------------------------------------
+# quality_score TESTS
+# ---------------------------------------------------------------------------
+
+
+class TestQualityScore:
+    """Quality scoring: richer controls should score higher.
+
+    The expected values pin the weights these tests rely on: 2.0 per
+    requirement, 1.5 per test-procedure step, 1.0 per evidence item, and
+    objective length / 200 capped at 3.0.
+    """
+
+    def test_empty_control(self):
+        """A control without any fields scores zero."""
+        score = quality_score({})
+        assert score == 0.0
+
+    def test_requirements_weight(self):
+        """Each requirement contributes 2.0."""
+        score = quality_score({"requirements": json.dumps(["r1", "r2", "r3"])})
+        assert score == pytest.approx(6.0)  # 3 * 2.0
+
+    def test_test_procedure_weight(self):
+        """Each test-procedure step contributes 1.5."""
+        score = quality_score({"test_procedure": json.dumps(["t1", "t2"])})
+        assert score == pytest.approx(3.0)  # 2 * 1.5
+
+    def test_evidence_weight(self):
+        """Each evidence item contributes 1.0."""
+        score = quality_score({"evidence": json.dumps(["e1"])})
+        assert score == pytest.approx(1.0)  # 1 * 1.0
+
+    def test_objective_weight_capped(self):
+        """Objective length counts as len/200 but never more than 3.0."""
+        short = quality_score({"objective": "x" * 100})
+        long = quality_score({"objective": "x" * 1000})
+        assert short == pytest.approx(0.5)  # 100/200
+        assert long == pytest.approx(3.0)   # capped at 3.0
+
+    def test_combined_score(self):
+        """All four components add up."""
+        control = {
+            "requirements": json.dumps(["r1", "r2"]),
+            "test_procedure": json.dumps(["t1"]),
+            "evidence": json.dumps(["e1", "e2"]),
+            "objective": "x" * 400,
+        }
+        # 2*2 + 1*1.5 + 2*1.0 + min(400/200, 3) = 4 + 1.5 + 2 + 2 = 9.5
+        assert quality_score(control) == pytest.approx(9.5)
+
+    def test_json_string_vs_list(self):
+        """Both JSON strings and already-parsed lists should work."""
+        # The second input must be a real list: json.dumps(["r1", "r2"]) and
+        # the literal '["r1", "r2"]' are the same string, so comparing those
+        # two would never exercise the already-parsed path.
+        from_json = quality_score({"requirements": json.dumps(["r1", "r2"])})
+        from_list = quality_score({"requirements": ["r1", "r2"]})
+        assert from_json == pytest.approx(4.0)  # 2 * 2.0
+        assert from_list == pytest.approx(from_json)
+
+    def test_null_fields(self):
+        """None values should not crash."""
+        score = quality_score({
+            "requirements": None,
+            "test_procedure": None,
+            "evidence": None,
+            "objective": None,
+        })
+        assert score == 0.0
+
+    def test_ranking_order(self):
+        """Rich control should rank above sparse control."""
+        rich = {
+            "requirements": json.dumps(["r1", "r2", "r3"]),
+            "test_procedure": json.dumps(["t1", "t2"]),
+            "evidence": json.dumps(["e1"]),
+            "objective": "A comprehensive objective for this control.",
+        }
+        sparse = {
+            "requirements": json.dumps(["r1"]),
+            "objective": "Short",
+        }
+        assert quality_score(rich) > quality_score(sparse)
+
+
+# ---------------------------------------------------------------------------
+# Sub-grouping TESTS
+# ---------------------------------------------------------------------------
+
+
+class TestSubGrouping:
+    """Grouping of controls by their ``merge_group_hint`` value."""
+
+    def _make_runner(self):
+        """Return a runner backed by a mock DB session."""
+        return BatchDedupRunner(db=MagicMock())
+
+    def test_groups_by_merge_hint(self):
+        """Controls sharing a hint end up in the same group."""
+        mfa_hint = "implement:mfa:none"
+        firewall_hint = "test:firewall:periodic"
+        batch = [
+            {"uuid": "a", "merge_group_hint": mfa_hint},
+            {"uuid": "b", "merge_group_hint": mfa_hint},
+            {"uuid": "c", "merge_group_hint": firewall_hint},
+        ]
+
+        grouped = self._make_runner()._sub_group_by_merge_hint(batch)
+
+        assert len(grouped) == 2
+        assert len(grouped["implement:mfa:none"]) == 2
+        assert len(grouped["test:firewall:periodic"]) == 1
+
+    def test_empty_hint_gets_own_group(self):
+        """Controls with an empty hint are never merged with each other."""
+        batch = [
+            {"uuid": "x", "merge_group_hint": ""},
+            {"uuid": "y", "merge_group_hint": ""},
+        ]
+
+        grouped = self._make_runner()._sub_group_by_merge_hint(batch)
+
+        # One group per empty-hint control
+        assert len(grouped) == 2
+
+    def test_single_control_single_group(self):
+        """A lone control yields exactly one group."""
+        only = {"uuid": "a", "merge_group_hint": "implement:mfa:none"}
+
+        grouped = self._make_runner()._sub_group_by_merge_hint([only])
+
+        assert len(grouped) == 1
+
+
+# ---------------------------------------------------------------------------
+# Master Selection TESTS
+# ---------------------------------------------------------------------------
+
+
+class TestMasterSelection:
+    """Best quality score should become master."""
+
+    @pytest.mark.asyncio
+    async def test_highest_score_is_master(self):
+        """Three controls in one merge group yield one master, two links."""
+        session = MagicMock()
+        session.execute = MagicMock()
+        session.commit = MagicMock()
+        runner = BatchDedupRunner(db=session)
+
+        # Deliberately passed out of quality order: sparse, medium, rich.
+        sparse = _make_control("s1", reqs=1, hint="implement:mfa:none")
+        rich = _make_control("r1", reqs=5, tests=3, evidence=2, hint="implement:mfa:none")
+        medium = _make_control("m1", reqs=2, tests=1, hint="implement:mfa:none")
+        group = [sparse, medium, rich]
+
+        # Every similarity search reports a strong match against the rich control.
+        search_hit = {
+            "score": 0.95,
+            "payload": {"control_uuid": rich["uuid"],
+                        "control_id": rich["control_id"]},
+        }
+
+        # Patch embedding and Qdrant access so no real API calls happen.
+        with patch("compliance.services.batch_dedup_runner.get_embedding",
+                    new_callable=AsyncMock, return_value=[0.1] * 1024), \
+             patch("compliance.services.batch_dedup_runner.qdrant_upsert",
+                    new_callable=AsyncMock, return_value=True), \
+             patch("compliance.services.batch_dedup_runner.qdrant_search",
+                    new_callable=AsyncMock, return_value=[search_hit]):
+            await runner._process_pattern_group("CP-AUTH-001", group, dry_run=True)
+
+        # One master, the remaining two linked to it.
+        assert runner.stats["masters"] == 1
+        assert runner.stats["linked"] == 2
+
+
+# ---------------------------------------------------------------------------
+# Dry Run TESTS
+# ---------------------------------------------------------------------------
+
+
+class TestDryRun:
+    """Dry run should compute stats but NOT modify DB."""
+
+    @pytest.mark.asyncio
+    async def test_dry_run_no_db_writes(self):
+        """Dry run counts a master but neither indexes nor commits."""
+        db = MagicMock()
+        db.execute = MagicMock()
+        db.commit = MagicMock()
+
+        runner = BatchDedupRunner(db=db)
+
+        controls = [
+            _make_control("a", reqs=3, hint="implement:mfa:none"),
+            _make_control("b", reqs=1, hint="implement:mfa:none"),
+        ]
+
+        # Keep a handle on the patched upsert so its call count can be
+        # checked; importing the name after the patch context has exited
+        # would only return the real function.
+        with patch("compliance.services.batch_dedup_runner.get_embedding",
+                    new_callable=AsyncMock, return_value=[0.1] * 1024), \
+             patch("compliance.services.batch_dedup_runner.qdrant_upsert",
+                    new_callable=AsyncMock, return_value=True) as mock_upsert, \
+             patch("compliance.services.batch_dedup_runner.qdrant_search",
+                    new_callable=AsyncMock, return_value=[{
+                        "score": 0.95,
+                        "payload": {"control_uuid": "a-uuid",
+                                    "control_id": "AUTH-001"},
+                    }]):
+            await runner._process_pattern_group("CP-AUTH-001", controls, dry_run=True)
+
+        # Stats are still computed in dry run
+        assert runner.stats["masters"] == 1
+        # qdrant_upsert should NOT have been called (dry_run skips indexing)
+        mock_upsert.assert_not_called()
+        # No commit for dedup operations
+        db.commit.assert_not_called()
+
+
+# ---------------------------------------------------------------------------
+# Parent Link Transfer TESTS
+# ---------------------------------------------------------------------------
+
+
+class TestParentLinkTransfer:
+    """Parent links should migrate from duplicate to master."""
+
+    def test_transfer_parent_links(self):
+        """Both parent links of the duplicate are re-created on the master."""
+        parent_rows = [
+            ("parent-1", "decomposition", 1.0, "DSGVO", "Art. 32", "obl-1"),
+            ("parent-2", "decomposition", 0.9, "NIS2", "Art. 21", "obl-2"),
+        ]
+        session = MagicMock()
+        # The lookup of the duplicate's parent links returns two rows.
+        session.execute.return_value.fetchall.return_value = parent_rows
+
+        transferred = BatchDedupRunner(db=session)._transfer_parent_links(
+            "master-uuid", "dup-uuid"
+        )
+
+        assert transferred == 2
+        # 1 SELECT + 2 INSERTs
+        assert session.execute.call_count == 3
+
+    def test_transfer_skips_self_reference(self):
+        """A parent link that points at the master itself is not transferred."""
+        session = MagicMock()
+        session.execute.return_value.fetchall.return_value = [
+            ("master-uuid", "decomposition", 1.0, "DSGVO", "Art. 32", "obl-1"),
+        ]
+
+        transferred = BatchDedupRunner(db=session)._transfer_parent_links(
+            "master-uuid", "dup-uuid"
+        )
+
+        assert transferred == 0
+
+
+# ---------------------------------------------------------------------------
+# Title-identical Short-circuit TESTS
+# ---------------------------------------------------------------------------
+
+
+class TestTitleIdenticalShortCircuit:
+    """Identical titles within a merge group bypass the embedding lookup."""
+
+    @pytest.mark.asyncio
+    async def test_identical_titles_skip_embedding(self):
+        """Controls with identical titles in same merge group → direct link."""
+        session = MagicMock()
+        session.execute = MagicMock()
+        session.commit = MagicMock()
+        # The parent-link transfer query finds nothing to move.
+        session.execute.return_value.fetchall.return_value = []
+        runner = BatchDedupRunner(db=session)
+
+        shared = {"hint": "implement:mfa:none", "title": "MFA implementieren"}
+        master = _make_control("m", reqs=3, **shared)
+        candidate = _make_control("c", reqs=1, **shared)
+
+        with patch("compliance.services.batch_dedup_runner.get_embedding",
+                    new_callable=AsyncMock) as embed_mock:
+            await runner._check_and_link(master, candidate, "CP-AUTH-001", dry_run=False)
+
+        # The short-circuit must link without ever requesting an embedding.
+        embed_mock.assert_not_called()
+        assert runner.stats["linked"] == 1
+        assert runner.stats["skipped_title_identical"] == 1
+
+
+# ---------------------------------------------------------------------------
+# Cross-Regulation Pass TESTS
+# ---------------------------------------------------------------------------
+
+
+class TestCrossRegulationPass:
+    """Cross-regulation pass: only matches from a different pattern are linked."""
+
+    @staticmethod
+    def _runner_with_master(master_row):
+        """Build a runner whose mocked DB returns ``master_row`` from fetchall()."""
+        session = MagicMock()
+        session.execute = MagicMock()
+        session.commit = MagicMock()
+        # First call: load masters
+        session.execute.return_value.fetchall.return_value = [master_row]
+        return BatchDedupRunner(db=session)
+
+    @staticmethod
+    async def _run_pass_with_hits(runner, hits):
+        """Run the pass with embedding and cross-regulation search patched."""
+        with patch("compliance.services.batch_dedup_runner.get_embedding",
+                    new_callable=AsyncMock, return_value=[0.1] * 1024), \
+             patch("compliance.services.batch_dedup_runner.qdrant_search_cross_regulation",
+                    new_callable=AsyncMock, return_value=hits):
+            await runner._run_cross_regulation_pass()
+
+    @pytest.mark.asyncio
+    async def test_cross_reg_creates_link(self):
+        """A high-scoring match from another pattern is counted as linked."""
+        runner = self._runner_with_master(
+            ("uuid-1", "AUTH-001", "MFA implementieren", "CP-AUTH-001",
+             "implement:multi_factor_auth:none"),
+        )
+        hits = [{
+            "score": 0.96,
+            "payload": {
+                "control_uuid": "uuid-2",
+                "control_id": "SEC-001",
+                "pattern_id": "CP-SEC-001",  # different pattern!
+            },
+        }]
+
+        await self._run_pass_with_hits(runner, hits)
+
+        assert runner.stats["cross_reg_linked"] == 1
+
+    @pytest.mark.asyncio
+    async def test_cross_reg_ignores_same_pattern(self):
+        """Cross-reg should NOT link controls from same pattern."""
+        runner = self._runner_with_master(
+            ("uuid-1", "AUTH-001", "MFA", "CP-AUTH-001", "implement:mfa:none"),
+        )
+        # Match from SAME pattern
+        hits = [{
+            "score": 0.97,
+            "payload": {
+                "control_uuid": "uuid-3",
+                "control_id": "AUTH-002",
+                "pattern_id": "CP-AUTH-001",  # same pattern
+            },
+        }]
+
+        await self._run_pass_with_hits(runner, hits)
+
+        assert runner.stats["cross_reg_linked"] == 0
+
+
+# ---------------------------------------------------------------------------
+# Progress Stats TESTS
+# ---------------------------------------------------------------------------
+
+
+class TestProgressStats:
+    """get_status() exposes counters and the current progress position."""
+
+    def test_get_status(self):
+        """Status reflects the stats and progress fields set on the runner."""
+        runner = BatchDedupRunner(db=MagicMock())
+        runner.stats["masters"] = 42
+        runner.stats["linked"] = 100
+        runner._progress_pattern = "CP-AUTH-001"
+        runner._progress_count = 500
+
+        status = runner.get_status()
+
+        expected = {
+            "pattern": "CP-AUTH-001",
+            "progress": 500,
+            "masters": 42,
+            "linked": 100,
+        }
+        for key, value in expected.items():
+            assert status[key] == value
+
+
+# ---------------------------------------------------------------------------
+# Route endpoint TESTS
+# ---------------------------------------------------------------------------
+
+
+class TestBatchDedupRoutes:
+    """Test the batch-dedup API endpoints."""
+
+    def test_status_endpoint_not_running(self):
+        """GET status returns 200 and running=False when no run was started."""
+        from fastapi import FastAPI
+        from fastapi.testclient import TestClient
+        from compliance.api.crosswalk_routes import router
+
+        api = FastAPI()
+        api.include_router(router, prefix="/api/compliance")
+        http = TestClient(api)
+
+        with patch("compliance.api.crosswalk_routes.SessionLocal") as session_factory:
+            fake_db = MagicMock()
+            session_factory.return_value = fake_db
+            # Row handed back by the endpoint's query; presumably three
+            # aggregate counts — TODO confirm against the route.
+            fake_db.execute.return_value.fetchone.return_value = (85000, 0, 85000)
+
+            response = http.get("/api/compliance/v1/canonical/migrate/batch-dedup/status")
+
+        assert response.status_code == 200
+        payload = response.json()
+        assert payload["running"] is False
+
+
+# ---------------------------------------------------------------------------
+# HELPERS
+# ---------------------------------------------------------------------------
+
+
+def _make_control(
+    prefix: str,
+    reqs: int = 0,
+    tests: int = 0,
+    evidence: int = 0,
+    hint: str = "",
+    title: str = None,
+    pattern_id: str = "CP-AUTH-001",
+) -> dict:
+    """Return a control dict fixture derived from ``prefix``.
+
+    ``reqs``, ``tests`` and ``evidence`` set how many placeholder entries
+    the JSON-encoded ``requirements``, ``test_procedure`` and ``evidence``
+    lists contain. A falsy ``title`` falls back to ``"Control <prefix>"``.
+    """
+
+    def _encoded(tag: str, count: int) -> str:
+        # JSON list of "<tag>0", "<tag>1", ... with ``count`` entries.
+        return json.dumps([f"{tag}{i}" for i in range(count)])
+
+    control = {}
+    control["uuid"] = f"{prefix}-uuid"
+    control["control_id"] = f"AUTH-{prefix}"
+    control["title"] = title if title else f"Control {prefix}"
+    control["objective"] = f"Objective for {prefix}"
+    control["pattern_id"] = pattern_id
+    control["requirements"] = _encoded("r", reqs)
+    control["test_procedure"] = _encoded("t", tests)
+    control["evidence"] = _encoded("e", evidence)
+    control["release_state"] = "draft"
+    control["merge_group_hint"] = hint
+    control["action_object_class"] = ""
+    return control