# breakpilot-compliance/backend-compliance/tests/test_batch_dedup_runner.py

"""Tests for Batch Dedup Runner (batch_dedup_runner.py).

Covers:
- quality_score(): Richness ranking
- BatchDedupRunner._sub_group_by_merge_hint(): Composite key grouping
- Master selection (highest quality score wins)
- Duplicate linking (mark + parent-link transfer)
- Dry run mode (no DB changes)
- Cross-group pass
- Progress reporting / stats
- Title-identical short-circuit vs. embedding check
- Batch-dedup status route endpoint
"""

import json
import pytest
from unittest.mock import MagicMock, AsyncMock, patch, call

from compliance.services.batch_dedup_runner import (
    quality_score,
    BatchDedupRunner,
    DEDUP_COLLECTION,
)


# ---------------------------------------------------------------------------
# quality_score TESTS
# ---------------------------------------------------------------------------


class TestQualityScore:
    """Quality scoring: richer controls should score higher.

    The expectations below pin these weights: 2.0 per requirement, 1.5 per
    test procedure, 1.0 per evidence item, and ``len(objective) / 200``
    capped at 3.0.
    """

    def test_empty_control(self):
        """A control without any fields scores zero."""
        score = quality_score({})
        assert score == 0.0

    def test_requirements_weight(self):
        """Each requirement contributes 2.0."""
        score = quality_score({"requirements": json.dumps(["r1", "r2", "r3"])})
        assert score == pytest.approx(6.0)  # 3 * 2.0

    def test_test_procedure_weight(self):
        """Each test procedure contributes 1.5."""
        score = quality_score({"test_procedure": json.dumps(["t1", "t2"])})
        assert score == pytest.approx(3.0)  # 2 * 1.5

    def test_evidence_weight(self):
        """Each evidence item contributes 1.0."""
        score = quality_score({"evidence": json.dumps(["e1"])})
        assert score == pytest.approx(1.0)  # 1 * 1.0

    def test_objective_weight_capped(self):
        """Objective length contributes len/200, capped at 3.0."""
        short = quality_score({"objective": "x" * 100})
        long = quality_score({"objective": "x" * 1000})
        assert short == pytest.approx(0.5)  # 100/200
        assert long == pytest.approx(3.0)   # capped at 3.0

    def test_combined_score(self):
        """All components add up."""
        control = {
            "requirements": json.dumps(["r1", "r2"]),
            "test_procedure": json.dumps(["t1"]),
            "evidence": json.dumps(["e1", "e2"]),
            "objective": "x" * 400,
        }
        # 2*2 + 1*1.5 + 2*1.0 + min(400/200, 3) = 4 + 1.5 + 2 + 2 = 9.5
        assert quality_score(control) == pytest.approx(9.5)

    def test_json_string_vs_list(self):
        """Both JSON strings and already-parsed lists should work."""
        parsed = ["r1", "r2"]
        from_dumps = quality_score({"requirements": json.dumps(parsed)})
        from_literal = quality_score({"requirements": '["r1", "r2"]'})
        # Pass a real list as well: comparing two JSON strings alone would
        # never exercise the already-parsed path this test is named after.
        from_list = quality_score({"requirements": parsed})
        assert from_dumps == from_literal
        assert from_list == from_dumps

    def test_null_fields(self):
        """None values should not crash."""
        score = quality_score({
            "requirements": None,
            "test_procedure": None,
            "evidence": None,
            "objective": None,
        })
        assert score == 0.0

    def test_ranking_order(self):
        """Rich control should rank above sparse control."""
        rich = {
            "requirements": json.dumps(["r1", "r2", "r3"]),
            "test_procedure": json.dumps(["t1", "t2"]),
            "evidence": json.dumps(["e1"]),
            "objective": "A comprehensive objective for this control.",
        }
        sparse = {
            "requirements": json.dumps(["r1"]),
            "objective": "Short",
        }
        assert quality_score(rich) > quality_score(sparse)


# ---------------------------------------------------------------------------
# Sub-grouping TESTS
# ---------------------------------------------------------------------------


class TestSubGrouping:
    """Grouping of controls by their ``merge_group_hint`` value."""

    def _make_runner(self):
        """Return a runner wired to a mocked DB session."""
        return BatchDedupRunner(db=MagicMock())

    def test_groups_by_merge_hint(self):
        """Controls sharing a hint end up in the same group."""
        mfa_hint = "implement:mfa:none"
        firewall_hint = "test:firewall:periodic"
        assignments = (("a", mfa_hint), ("b", mfa_hint), ("c", firewall_hint))
        controls = [
            {"uuid": uid, "merge_group_hint": hint} for uid, hint in assignments
        ]

        grouped = self._make_runner()._sub_group_by_merge_hint(controls)

        assert len(grouped) == 2
        assert len(grouped[mfa_hint]) == 2
        assert len(grouped[firewall_hint]) == 1

    def test_empty_hint_gets_own_group(self):
        """Controls with an empty hint are never merged with each other."""
        controls = [{"uuid": uid, "merge_group_hint": ""} for uid in ("x", "y")]

        grouped = self._make_runner()._sub_group_by_merge_hint(controls)

        # One group per empty-hint control
        assert len(grouped) == 2

    def test_single_control_single_group(self):
        """A lone control yields exactly one group."""
        only_control = {"uuid": "a", "merge_group_hint": "implement:mfa:none"}

        grouped = self._make_runner()._sub_group_by_merge_hint([only_control])

        assert len(grouped) == 1


# ---------------------------------------------------------------------------
# Master Selection TESTS
# ---------------------------------------------------------------------------


class TestMasterSelection:
    """Best quality score should become master."""

    @pytest.mark.asyncio
    async def test_highest_score_is_master(self):
        """A three-member group yields one master and two linked controls."""
        hint = "implement:mfa:none"
        shared_title = "MFA implementiert"

        session = MagicMock()
        session.execute = MagicMock()
        session.commit = MagicMock()
        # Parent link transfer query finds nothing to move
        session.execute.return_value.fetchall.return_value = []

        runner = BatchDedupRunner(db=session)

        sparse = _make_control("s1", reqs=1, hint=hint, title=shared_title)
        rich = _make_control(
            "r1", reqs=5, tests=3, evidence=2, hint=hint, title=shared_title
        )
        medium = _make_control("m1", reqs=2, tests=1, hint=hint, title=shared_title)

        embed_patch = patch(
            "compliance.services.batch_dedup_runner.get_embedding",
            new_callable=AsyncMock,
            return_value=[0.1] * 1024,
        )
        upsert_patch = patch(
            "compliance.services.batch_dedup_runner.qdrant_upsert",
            new_callable=AsyncMock,
            return_value=True,
        )

        # Identical titles everywhere → every non-master is title-identical linked
        with embed_patch, upsert_patch:
            await runner._process_hint_group(
                hint, [sparse, medium, rich], dry_run=True
            )

        # One master (expected: the rich control), the other two linked
        assert runner.stats["masters"] == 1
        assert runner.stats["linked"] == 2
        assert runner.stats["skipped_title_identical"] == 2


# ---------------------------------------------------------------------------
# Dry Run TESTS
# ---------------------------------------------------------------------------


class TestDryRun:
    """Dry run should compute stats but NOT modify DB."""

    @pytest.mark.asyncio
    async def test_dry_run_no_db_writes(self):
        """Stats are counted in dry-run mode while the session is never committed."""
        hint = "implement:mfa:none"

        session = MagicMock()
        session.execute = MagicMock()
        session.commit = MagicMock()

        runner = BatchDedupRunner(db=session)

        pair = [
            _make_control(name, reqs=req_count, hint=hint, title="MFA impl")
            for name, req_count in (("a", 3), ("b", 1))
        ]

        embed_patch = patch(
            "compliance.services.batch_dedup_runner.get_embedding",
            new_callable=AsyncMock,
            return_value=[0.1] * 1024,
        )
        upsert_patch = patch(
            "compliance.services.batch_dedup_runner.qdrant_upsert",
            new_callable=AsyncMock,
            return_value=True,
        )
        with embed_patch, upsert_patch:
            await runner._process_hint_group(hint, pair, dry_run=True)

        assert runner.stats["masters"] == 1
        assert runner.stats["linked"] == 1
        # Dedup operations must not commit anything during a dry run
        session.commit.assert_not_called()


# ---------------------------------------------------------------------------
# Parent Link Transfer TESTS
# ---------------------------------------------------------------------------


class TestParentLinkTransfer:
    """Parent links should migrate from duplicate to master."""

    def test_transfer_parent_links(self):
        """Both parent links of the duplicate are reported as transferred."""
        duplicate_links = [
            ("parent-1", "decomposition", 1.0, "DSGVO", "Art. 32", "obl-1"),
            ("parent-2", "decomposition", 0.9, "NIS2", "Art. 21", "obl-2"),
        ]
        session = MagicMock()
        session.execute.return_value.fetchall.return_value = duplicate_links

        transferred = BatchDedupRunner(db=session)._transfer_parent_links(
            "master-uuid", "dup-uuid"
        )

        assert transferred == 2
        # 1 SELECT to read the links + 1 INSERT per transferred link
        assert session.execute.call_count == 3

    def test_transfer_skips_self_reference(self):
        """A link whose parent is the master itself is not transferred."""
        self_link = ("master-uuid", "decomposition", 1.0, "DSGVO", "Art. 32", "obl-1")
        session = MagicMock()
        session.execute.return_value.fetchall.return_value = [self_link]

        transferred = BatchDedupRunner(db=session)._transfer_parent_links(
            "master-uuid", "dup-uuid"
        )

        assert transferred == 0


# ---------------------------------------------------------------------------
# Title-identical Short-circuit TESTS
# ---------------------------------------------------------------------------


class TestTitleIdenticalShortCircuit:
    """Identical titles link directly; differing titles go through embeddings."""

    @pytest.mark.asyncio
    async def test_identical_titles_skip_embedding(self):
        """Controls with identical titles in same hint group → direct link."""
        db = MagicMock()
        db.execute = MagicMock()
        db.commit = MagicMock()
        db.execute.return_value.fetchall.return_value = []

        runner = BatchDedupRunner(db=db)

        controls = [
            _make_control("m", reqs=3, hint="implement:mfa:none",
                          title="MFA implementieren"),
            _make_control("c", reqs=1, hint="implement:mfa:none",
                          title="MFA implementieren"),
        ]

        with patch("compliance.services.batch_dedup_runner.get_embedding",
                    new_callable=AsyncMock) as mock_embed, \
             patch("compliance.services.batch_dedup_runner.qdrant_upsert",
                    new_callable=AsyncMock, return_value=True):
            await runner._process_hint_group("implement:mfa:none", controls, dry_run=False)

        # Embedding should only be called for the master (indexing), not for
        # linking — so at most one call is allowed for this two-control group.
        assert mock_embed.call_count <= 1
        assert runner.stats["linked"] == 1
        assert runner.stats["skipped_title_identical"] == 1

    @pytest.mark.asyncio
    async def test_different_titles_use_embedding(self):
        """Controls with different titles should use embedding check."""
        db = MagicMock()
        db.execute = MagicMock()
        db.commit = MagicMock()
        db.execute.return_value.fetchall.return_value = []

        runner = BatchDedupRunner(db=db)

        controls = [
            _make_control("m", reqs=3, hint="implement:mfa:none",
                          title="MFA implementieren fuer Admins"),
            _make_control("c", reqs=1, hint="implement:mfa:none",
                          title="MFA einrichten fuer alle Benutzer"),
        ]

        with patch("compliance.services.batch_dedup_runner.get_embedding",
                    new_callable=AsyncMock, return_value=[0.1] * 1024) as mock_embed, \
             patch("compliance.services.batch_dedup_runner.qdrant_upsert",
                    new_callable=AsyncMock, return_value=True), \
             patch("compliance.services.batch_dedup_runner.qdrant_search_cross_regulation",
                    new_callable=AsyncMock, return_value=[]):
            await runner._process_hint_group("implement:mfa:none", controls, dry_run=False)

        # Different titles → embedding was called for both (master + candidate)
        assert mock_embed.call_count >= 2
        # No Qdrant results → linked anyway (same hint = same action+object)
        assert runner.stats["linked"] == 1


# ---------------------------------------------------------------------------
# Cross-Group Pass TESTS
# ---------------------------------------------------------------------------


class TestCrossGroupPass:
    """Cross-group pass links masters that match across hint groups."""

    @pytest.mark.asyncio
    async def test_cross_group_creates_link(self):
        """A cross-regulation hit for a master is counted as a cross-group link."""
        master_rows = [
            ("uuid-1", "CTRL-001", "MFA implementieren",
             "implement:multi_factor_auth:none"),
        ]
        # The first query yields the masters; every later query (for the
        # transfer) comes back empty.
        queued_results = iter([master_rows])

        def mock_execute(stmt, params=None):
            result = MagicMock()
            result.fetchall.return_value = next(queued_results, [])
            return result

        session = MagicMock()
        session.commit = MagicMock()
        session.execute = mock_execute

        runner = BatchDedupRunner(db=session)

        matched_payload = {
            "control_uuid": "uuid-2",
            "control_id": "CTRL-002",
            "merge_group_hint": "implement:mfa:continuous",
        }
        cross_result = [{"score": 0.95, "payload": matched_payload}]

        embed_patch = patch(
            "compliance.services.batch_dedup_runner.get_embedding",
            new_callable=AsyncMock,
            return_value=[0.1] * 1024,
        )
        search_patch = patch(
            "compliance.services.batch_dedup_runner.qdrant_search_cross_regulation",
            new_callable=AsyncMock,
            return_value=cross_result,
        )
        with embed_patch, search_patch:
            await runner._run_cross_group_pass()

        assert runner.stats["cross_group_linked"] == 1


# ---------------------------------------------------------------------------
# Progress Stats TESTS
# ---------------------------------------------------------------------------


class TestProgressStats:
    """Status reporting of the runner."""

    def test_get_status(self):
        """get_status() exposes the progress fields and the stats counters."""
        runner = BatchDedupRunner(db=MagicMock())
        runner.stats["masters"] = 42
        runner.stats["linked"] = 100
        runner._progress_phase = "phase1"
        runner._progress_count = 500
        runner._progress_total = 85000

        reported = runner.get_status()

        expected = {
            "phase": "phase1",
            "progress": 500,
            "total": 85000,
            "masters": 42,
            "linked": 100,
        }
        for field, value in expected.items():
            assert reported[field] == value


# ---------------------------------------------------------------------------
# Route endpoint TESTS
# ---------------------------------------------------------------------------


class TestBatchDedupRoutes:
    """Test the batch-dedup API endpoints."""

    def test_status_endpoint_not_running(self):
        """The status endpoint answers 200 with ``running`` set to False."""
        from fastapi import FastAPI
        from fastapi.testclient import TestClient
        from compliance.api.crosswalk_routes import router

        application = FastAPI()
        application.include_router(router, prefix="/api/compliance")
        http = TestClient(application)

        session = MagicMock()
        session.execute.return_value.fetchone.return_value = (85000, 0, 85000)

        with patch(
            "compliance.api.crosswalk_routes.SessionLocal",
            return_value=session,
        ):
            response = http.get(
                "/api/compliance/v1/canonical/migrate/batch-dedup/status"
            )

        assert response.status_code == 200
        assert response.json()["running"] is False


# ---------------------------------------------------------------------------
# HELPERS
# ---------------------------------------------------------------------------


def _make_control(
    prefix: str,
    reqs: int = 0,
    tests: int = 0,
    evidence: int = 0,
    hint: str = "",
    title: str = None,
    pattern_id: str = None,
) -> dict:
    """Create a fake control record for the tests in this module.

    ``prefix`` seeds the uuid, control id, objective and (when ``title`` is
    falsy) the title. ``reqs``, ``tests`` and ``evidence`` give the number of
    generated entries in the matching JSON-encoded list field. ``hint`` is
    stored as ``merge_group_hint`` and ``pattern_id`` is passed through as-is.
    """

    def _encoded_items(tag, count):
        # JSON array of "<tag>0", "<tag>1", ... with `count` entries
        return json.dumps([f"{tag}{idx}" for idx in range(count)])

    display_title = title if title else f"Control {prefix}"

    control = {
        "uuid": f"{prefix}-uuid",
        "control_id": f"CTRL-{prefix}",
        "title": display_title,
        "objective": f"Objective for {prefix}",
        "pattern_id": pattern_id,
    }
    control["requirements"] = _encoded_items("r", reqs)
    control["test_procedure"] = _encoded_items("t", tests)
    control["evidence"] = _encoded_items("e", evidence)
    control["release_state"] = "draft"
    control["merge_group_hint"] = hint
    control["action_object_class"] = ""
    return control