"""Tests for Batch Dedup Runner (batch_dedup_runner.py). Covers: - quality_score(): Richness ranking - BatchDedupRunner._sub_group_by_merge_hint(): Composite key grouping - Master selection (highest quality score wins) - Duplicate linking (mark + parent-link transfer) - Dry run mode (no DB changes) - Cross-regulation pass - Progress reporting / stats """ import json import pytest from unittest.mock import MagicMock, AsyncMock, patch, call from compliance.services.batch_dedup_runner import ( quality_score, BatchDedupRunner, DEDUP_COLLECTION, ) # --------------------------------------------------------------------------- # quality_score TESTS # --------------------------------------------------------------------------- class TestQualityScore: """Quality scoring: richer controls should score higher.""" def test_empty_control(self): score = quality_score({}) assert score == 0.0 def test_requirements_weight(self): score = quality_score({"requirements": json.dumps(["r1", "r2", "r3"])}) assert score == pytest.approx(6.0) # 3 * 2.0 def test_test_procedure_weight(self): score = quality_score({"test_procedure": json.dumps(["t1", "t2"])}) assert score == pytest.approx(3.0) # 2 * 1.5 def test_evidence_weight(self): score = quality_score({"evidence": json.dumps(["e1"])}) assert score == pytest.approx(1.0) # 1 * 1.0 def test_objective_weight_capped(self): short = quality_score({"objective": "x" * 100}) long = quality_score({"objective": "x" * 1000}) assert short == pytest.approx(0.5) # 100/200 assert long == pytest.approx(3.0) # capped at 3.0 def test_combined_score(self): control = { "requirements": json.dumps(["r1", "r2"]), "test_procedure": json.dumps(["t1"]), "evidence": json.dumps(["e1", "e2"]), "objective": "x" * 400, } # 2*2 + 1*1.5 + 2*1.0 + min(400/200, 3) = 4 + 1.5 + 2 + 2 = 9.5 assert quality_score(control) == pytest.approx(9.5) def test_json_string_vs_list(self): """Both JSON strings and already-parsed lists should work.""" a = quality_score({"requirements": 
json.dumps(["r1", "r2"])}) b = quality_score({"requirements": '["r1", "r2"]'}) assert a == b def test_null_fields(self): """None values should not crash.""" score = quality_score({ "requirements": None, "test_procedure": None, "evidence": None, "objective": None, }) assert score == 0.0 def test_ranking_order(self): """Rich control should rank above sparse control.""" rich = { "requirements": json.dumps(["r1", "r2", "r3"]), "test_procedure": json.dumps(["t1", "t2"]), "evidence": json.dumps(["e1"]), "objective": "A comprehensive objective for this control.", } sparse = { "requirements": json.dumps(["r1"]), "objective": "Short", } assert quality_score(rich) > quality_score(sparse) # --------------------------------------------------------------------------- # Sub-grouping TESTS # --------------------------------------------------------------------------- class TestSubGrouping: def _make_runner(self): db = MagicMock() return BatchDedupRunner(db=db) def test_groups_by_merge_hint(self): runner = self._make_runner() controls = [ {"uuid": "a", "merge_group_hint": "implement:mfa:none"}, {"uuid": "b", "merge_group_hint": "implement:mfa:none"}, {"uuid": "c", "merge_group_hint": "test:firewall:periodic"}, ] groups = runner._sub_group_by_merge_hint(controls) assert len(groups) == 2 assert len(groups["implement:mfa:none"]) == 2 assert len(groups["test:firewall:periodic"]) == 1 def test_empty_hint_gets_own_group(self): runner = self._make_runner() controls = [ {"uuid": "x", "merge_group_hint": ""}, {"uuid": "y", "merge_group_hint": ""}, ] groups = runner._sub_group_by_merge_hint(controls) # Each empty-hint control gets its own group assert len(groups) == 2 def test_single_control_single_group(self): runner = self._make_runner() controls = [ {"uuid": "a", "merge_group_hint": "implement:mfa:none"}, ] groups = runner._sub_group_by_merge_hint(controls) assert len(groups) == 1 # --------------------------------------------------------------------------- # Master Selection TESTS # 
# ---------------------------------------------------------------------------


class TestMasterSelection:
    """Best quality score should become master."""

    @pytest.mark.asyncio
    async def test_highest_score_is_master(self):
        """In a group, the control with highest quality_score is master."""
        db = MagicMock()
        db.execute = MagicMock()
        db.commit = MagicMock()
        runner = BatchDedupRunner(db=db)

        sparse = _make_control("s1", reqs=1, hint="implement:mfa:none")
        rich = _make_control("r1", reqs=5, tests=3, evidence=2, hint="implement:mfa:none")
        medium = _make_control("m1", reqs=2, tests=1, hint="implement:mfa:none")

        controls = [sparse, medium, rich]

        # Mock embedding to avoid real API calls
        with patch("compliance.services.batch_dedup_runner.get_embedding",
                   new_callable=AsyncMock, return_value=[0.1] * 1024), \
             patch("compliance.services.batch_dedup_runner.qdrant_upsert",
                   new_callable=AsyncMock, return_value=True), \
             patch("compliance.services.batch_dedup_runner.qdrant_search",
                   new_callable=AsyncMock, return_value=[{
                       "score": 0.95,
                       "payload": {"control_uuid": rich["uuid"], "control_id": rich["control_id"]},
                   }]):
            await runner._process_pattern_group("CP-AUTH-001", controls, dry_run=True)

        # Rich should be master (1 master), others linked (2 linked)
        assert runner.stats["masters"] == 1
        assert runner.stats["linked"] == 2


# ---------------------------------------------------------------------------
# Dry Run TESTS
# ---------------------------------------------------------------------------


class TestDryRun:
    """Dry run should compute stats but NOT modify DB."""

    @pytest.mark.asyncio
    async def test_dry_run_no_db_writes(self):
        """A dry run selects a master without indexing or committing."""
        db = MagicMock()
        db.execute = MagicMock()
        db.commit = MagicMock()
        runner = BatchDedupRunner(db=db)

        controls = [
            _make_control("a", reqs=3, hint="implement:mfa:none"),
            _make_control("b", reqs=1, hint="implement:mfa:none"),
        ]

        # Bind the upsert patch so the "no indexing" expectation can be asserted
        # while the mock object is still reachable.
        with patch("compliance.services.batch_dedup_runner.get_embedding",
                   new_callable=AsyncMock, return_value=[0.1] * 1024), \
             patch("compliance.services.batch_dedup_runner.qdrant_upsert",
                   new_callable=AsyncMock, return_value=True) as mock_upsert, \
             patch("compliance.services.batch_dedup_runner.qdrant_search",
                   new_callable=AsyncMock, return_value=[{
                       "score": 0.95,
                       "payload": {"control_uuid": "a-uuid", "control_id": "AUTH-001"},
                   }]):
            await runner._process_pattern_group("CP-AUTH-001", controls, dry_run=True)

        # No DB execute calls for UPDATE/INSERT (only the initial load query was mocked)
        # In dry_run, _mark_duplicate and _embed_and_index are skipped
        assert runner.stats["masters"] == 1
        # qdrant_upsert should NOT have been called (dry_run skips indexing)
        mock_upsert.assert_not_called()
        # No commit for dedup operations
        db.commit.assert_not_called()


# ---------------------------------------------------------------------------
# Parent Link Transfer TESTS
# ---------------------------------------------------------------------------


class TestParentLinkTransfer:
    """Parent links should migrate from duplicate to master."""

    def test_transfer_parent_links(self):
        """Every parent link of the duplicate is re-inserted for the master."""
        db = MagicMock()
        # Mock: duplicate has 2 parent links
        db.execute.return_value.fetchall.return_value = [
            ("parent-1", "decomposition", 1.0, "DSGVO", "Art. 32", "obl-1"),
            ("parent-2", "decomposition", 0.9, "NIS2", "Art. 21", "obl-2"),
        ]

        runner = BatchDedupRunner(db=db)
        count = runner._transfer_parent_links("master-uuid", "dup-uuid")

        assert count == 2
        # Two INSERT calls for the transferred links
        assert db.execute.call_count == 3  # 1 SELECT + 2 INSERTs

    def test_transfer_skips_self_reference(self):
        """A link whose parent is the master itself is not transferred."""
        db = MagicMock()
        # Parent link points to master itself → should be skipped
        db.execute.return_value.fetchall.return_value = [
            ("master-uuid", "decomposition", 1.0, "DSGVO", "Art. 32", "obl-1"),
        ]

        runner = BatchDedupRunner(db=db)
        count = runner._transfer_parent_links("master-uuid", "dup-uuid")

        assert count == 0


# ---------------------------------------------------------------------------
# Title-identical Short-circuit TESTS
# ---------------------------------------------------------------------------


class TestTitleIdenticalShortCircuit:
    """Identical titles are linked without computing an embedding."""

    @pytest.mark.asyncio
    async def test_identical_titles_skip_embedding(self):
        """Controls with identical titles in same merge group → direct link."""
        db = MagicMock()
        db.execute = MagicMock()
        db.commit = MagicMock()
        # Mock the parent link transfer query
        db.execute.return_value.fetchall.return_value = []

        runner = BatchDedupRunner(db=db)

        master = _make_control("m", reqs=3, hint="implement:mfa:none",
                               title="MFA implementieren")
        candidate = _make_control("c", reqs=1, hint="implement:mfa:none",
                                  title="MFA implementieren")

        with patch("compliance.services.batch_dedup_runner.get_embedding",
                   new_callable=AsyncMock) as mock_embed:
            await runner._check_and_link(master, candidate, "CP-AUTH-001", dry_run=False)

        # Embedding should NOT be called (title-identical short-circuit)
        mock_embed.assert_not_called()
        assert runner.stats["linked"] == 1
        assert runner.stats["skipped_title_identical"] == 1


# ---------------------------------------------------------------------------
# Cross-Regulation Pass TESTS
# ---------------------------------------------------------------------------


class TestCrossRegulationPass:
    """Linking of near-identical masters across different patterns."""

    @pytest.mark.asyncio
    async def test_cross_reg_creates_link(self):
        """A high-score match from another pattern is counted as linked."""
        db = MagicMock()
        db.execute = MagicMock()
        db.commit = MagicMock()
        # First call: load masters
        db.execute.return_value.fetchall.return_value = [
            ("uuid-1", "AUTH-001", "MFA implementieren", "CP-AUTH-001",
             "implement:multi_factor_auth:none"),
        ]

        runner = BatchDedupRunner(db=db)

        cross_result = [{
            "score": 0.96,
            "payload": {
                "control_uuid": "uuid-2",
                "control_id": "SEC-001",
                "pattern_id": "CP-SEC-001",  # different pattern!
            },
        }]

        with patch("compliance.services.batch_dedup_runner.get_embedding",
                   new_callable=AsyncMock, return_value=[0.1] * 1024), \
             patch("compliance.services.batch_dedup_runner.qdrant_search_cross_regulation",
                   new_callable=AsyncMock, return_value=cross_result):
            await runner._run_cross_regulation_pass()

        assert runner.stats["cross_reg_linked"] == 1

    @pytest.mark.asyncio
    async def test_cross_reg_ignores_same_pattern(self):
        """Cross-reg should NOT link controls from same pattern."""
        db = MagicMock()
        db.execute = MagicMock()
        db.commit = MagicMock()
        db.execute.return_value.fetchall.return_value = [
            ("uuid-1", "AUTH-001", "MFA", "CP-AUTH-001", "implement:mfa:none"),
        ]

        runner = BatchDedupRunner(db=db)

        # Match from SAME pattern
        cross_result = [{
            "score": 0.97,
            "payload": {
                "control_uuid": "uuid-3",
                "control_id": "AUTH-002",
                "pattern_id": "CP-AUTH-001",  # same pattern
            },
        }]

        with patch("compliance.services.batch_dedup_runner.get_embedding",
                   new_callable=AsyncMock, return_value=[0.1] * 1024), \
             patch("compliance.services.batch_dedup_runner.qdrant_search_cross_regulation",
                   new_callable=AsyncMock, return_value=cross_result):
            await runner._run_cross_regulation_pass()

        assert runner.stats["cross_reg_linked"] == 0


# ---------------------------------------------------------------------------
# Progress Stats TESTS
# ---------------------------------------------------------------------------


class TestProgressStats:
    """Status reporting of a runner."""

    def test_get_status(self):
        """get_status() exposes progress fields and stats counters."""
        db = MagicMock()
        runner = BatchDedupRunner(db=db)
        runner.stats["masters"] = 42
        runner.stats["linked"] = 100
        runner._progress_pattern = "CP-AUTH-001"
        runner._progress_count = 500

        status = runner.get_status()
        assert status["pattern"] == "CP-AUTH-001"
        assert status["progress"] == 500
        assert status["masters"] == 42
        assert status["linked"] == 100


# ---------------------------------------------------------------------------
# Route endpoint TESTS
# ---------------------------------------------------------------------------


class TestBatchDedupRoutes:
    """Test the batch-dedup API endpoints."""

    def test_status_endpoint_not_running(self):
        """The status endpoint answers 200 with running=False when idle."""
        from fastapi import FastAPI
        from fastapi.testclient import TestClient
        from compliance.api.crosswalk_routes import router

        app = FastAPI()
        app.include_router(router, prefix="/api/compliance")
        client = TestClient(app)

        with patch("compliance.api.crosswalk_routes.SessionLocal") as mock_session:
            mock_db = MagicMock()
            mock_session.return_value = mock_db
            mock_db.execute.return_value.fetchone.return_value = (85000, 0, 85000)

            resp = client.get("/api/compliance/v1/canonical/migrate/batch-dedup/status")
            assert resp.status_code == 200
            data = resp.json()
            assert data["running"] is False


# ---------------------------------------------------------------------------
# HELPERS
# ---------------------------------------------------------------------------


def _make_control(
    prefix: str,
    reqs: int = 0,
    tests: int = 0,
    evidence: int = 0,
    hint: str = "",
    title: str = "",
    pattern_id: str = "CP-AUTH-001",
) -> dict:
    """Build a mock control dict for testing.

    Args:
        prefix: Used to derive ``uuid``, ``control_id``, the default title and
            the objective text.
        reqs: Number of generated requirement entries.
        tests: Number of generated test procedure entries.
        evidence: Number of generated evidence entries.
        hint: Value stored as ``merge_group_hint``.
        title: Explicit title; any falsy value (empty string or ``None``)
            falls back to ``"Control <prefix>"``.
        pattern_id: Value stored as ``pattern_id``.

    Returns:
        A control dict whose list fields are JSON-encoded strings.
    """
    return {
        "uuid": f"{prefix}-uuid",
        "control_id": f"AUTH-{prefix}",
        "title": title or f"Control {prefix}",
        "objective": f"Objective for {prefix}",
        "pattern_id": pattern_id,
        "requirements": json.dumps([f"r{i}" for i in range(reqs)]),
        "test_procedure": json.dumps([f"t{i}" for i in range(tests)]),
        "evidence": json.dumps([f"e{i}" for i in range(evidence)]),
        "release_state": "draft",
        "merge_group_hint": hint,
        "action_object_class": "",
    }