"""Tests for Batch Dedup Runner (batch_dedup_runner.py). Covers: - quality_score(): Richness ranking - BatchDedupRunner._sub_group_by_merge_hint(): Composite key grouping - Master selection (highest quality score wins) - Duplicate linking (mark + parent-link transfer) - Dry run mode (no DB changes) - Cross-group pass - Progress reporting / stats """ import json import pytest from unittest.mock import MagicMock, AsyncMock, patch, call from compliance.services.batch_dedup_runner import ( quality_score, BatchDedupRunner, DEDUP_COLLECTION, ) # --------------------------------------------------------------------------- # quality_score TESTS # --------------------------------------------------------------------------- class TestQualityScore: """Quality scoring: richer controls should score higher.""" def test_empty_control(self): score = quality_score({}) assert score == 0.0 def test_requirements_weight(self): score = quality_score({"requirements": json.dumps(["r1", "r2", "r3"])}) assert score == pytest.approx(6.0) # 3 * 2.0 def test_test_procedure_weight(self): score = quality_score({"test_procedure": json.dumps(["t1", "t2"])}) assert score == pytest.approx(3.0) # 2 * 1.5 def test_evidence_weight(self): score = quality_score({"evidence": json.dumps(["e1"])}) assert score == pytest.approx(1.0) # 1 * 1.0 def test_objective_weight_capped(self): short = quality_score({"objective": "x" * 100}) long = quality_score({"objective": "x" * 1000}) assert short == pytest.approx(0.5) # 100/200 assert long == pytest.approx(3.0) # capped at 3.0 def test_combined_score(self): control = { "requirements": json.dumps(["r1", "r2"]), "test_procedure": json.dumps(["t1"]), "evidence": json.dumps(["e1", "e2"]), "objective": "x" * 400, } # 2*2 + 1*1.5 + 2*1.0 + min(400/200, 3) = 4 + 1.5 + 2 + 2 = 9.5 assert quality_score(control) == pytest.approx(9.5) def test_json_string_vs_list(self): """Both JSON strings and already-parsed lists should work.""" a = quality_score({"requirements": json.dumps(["r1", "r2"])}) b = quality_score({"requirements": '["r1", "r2"]'}) assert a == b def test_null_fields(self): """None values should not crash.""" score = quality_score({ "requirements": None, "test_procedure": None, "evidence": None, "objective": None, }) assert score == 0.0 def test_ranking_order(self): """Rich control should rank above sparse control.""" rich = { "requirements": json.dumps(["r1", "r2", "r3"]), "test_procedure": json.dumps(["t1", "t2"]), "evidence": json.dumps(["e1"]), "objective": "A comprehensive objective for this control.", } sparse = { "requirements": json.dumps(["r1"]), "objective": "Short", } assert quality_score(rich) > quality_score(sparse) # --------------------------------------------------------------------------- # Sub-grouping TESTS # --------------------------------------------------------------------------- class TestSubGrouping: def _make_runner(self): db = MagicMock() return BatchDedupRunner(db=db) def test_groups_by_merge_hint(self): runner = self._make_runner() controls = [ {"uuid": "a", "merge_group_hint": "implement:mfa:none"}, {"uuid": "b", "merge_group_hint": "implement:mfa:none"}, {"uuid": "c", "merge_group_hint": "test:firewall:periodic"}, ] groups = runner._sub_group_by_merge_hint(controls) assert len(groups) == 2 assert len(groups["implement:mfa:none"]) == 2 assert len(groups["test:firewall:periodic"]) == 1 def test_empty_hint_gets_own_group(self): runner = self._make_runner() controls = [ {"uuid": "x", "merge_group_hint": ""}, {"uuid": "y", "merge_group_hint": ""}, ] groups = runner._sub_group_by_merge_hint(controls) # Each empty-hint control gets its own group assert len(groups) == 2 def test_single_control_single_group(self): runner = self._make_runner() controls = [ {"uuid": "a", "merge_group_hint": "implement:mfa:none"}, ] groups = runner._sub_group_by_merge_hint(controls) assert len(groups) == 1 # --------------------------------------------------------------------------- # Master Selection TESTS # --------------------------------------------------------------------------- class TestMasterSelection: """Best quality score should become master.""" @pytest.mark.asyncio async def test_highest_score_is_master(self): """In a group, the control with highest quality_score is master.""" db = MagicMock() db.execute = MagicMock() db.commit = MagicMock() # Mock parent link transfer query db.execute.return_value.fetchall.return_value = [] runner = BatchDedupRunner(db=db) sparse = _make_control("s1", reqs=1, hint="implement:mfa:none", title="MFA implementiert") rich = _make_control("r1", reqs=5, tests=3, evidence=2, hint="implement:mfa:none", title="MFA implementiert") medium = _make_control("m1", reqs=2, tests=1, hint="implement:mfa:none", title="MFA implementiert") controls = [sparse, medium, rich] # All have same title → all should be title-identical linked with patch("compliance.services.batch_dedup_runner.get_embedding", new_callable=AsyncMock, return_value=[0.1] * 1024), \ patch("compliance.services.batch_dedup_runner.qdrant_upsert", new_callable=AsyncMock, return_value=True): await runner._process_hint_group("implement:mfa:none", controls, dry_run=True) # Rich should be master (1 master), others linked (2 linked) assert runner.stats["masters"] == 1 assert runner.stats["linked"] == 2 assert runner.stats["skipped_title_identical"] == 2 # --------------------------------------------------------------------------- # Dry Run TESTS # --------------------------------------------------------------------------- class TestDryRun: """Dry run should compute stats but NOT modify DB.""" @pytest.mark.asyncio async def test_dry_run_no_db_writes(self): db = MagicMock() db.execute = MagicMock() db.commit = MagicMock() runner = BatchDedupRunner(db=db) controls = [ _make_control("a", reqs=3, hint="implement:mfa:none", title="MFA impl"), _make_control("b", reqs=1, hint="implement:mfa:none", title="MFA impl"), ] with patch("compliance.services.batch_dedup_runner.get_embedding", new_callable=AsyncMock, return_value=[0.1] * 1024), \ patch("compliance.services.batch_dedup_runner.qdrant_upsert", new_callable=AsyncMock, return_value=True): await runner._process_hint_group("implement:mfa:none", controls, dry_run=True) assert runner.stats["masters"] == 1 assert runner.stats["linked"] == 1 # No commit for dedup operations in dry_run db.commit.assert_not_called() # --------------------------------------------------------------------------- # Parent Link Transfer TESTS # --------------------------------------------------------------------------- class TestParentLinkTransfer: """Parent links should migrate from duplicate to master.""" def test_transfer_parent_links(self): db = MagicMock() # Mock: duplicate has 2 parent links db.execute.return_value.fetchall.return_value = [ ("parent-1", "decomposition", 1.0, "DSGVO", "Art. 32", "obl-1"), ("parent-2", "decomposition", 0.9, "NIS2", "Art. 21", "obl-2"), ] runner = BatchDedupRunner(db=db) count = runner._transfer_parent_links("master-uuid", "dup-uuid") assert count == 2 # Two INSERT calls for the transferred links assert db.execute.call_count == 3 # 1 SELECT + 2 INSERTs def test_transfer_skips_self_reference(self): db = MagicMock() # Parent link points to master itself → should be skipped db.execute.return_value.fetchall.return_value = [ ("master-uuid", "decomposition", 1.0, "DSGVO", "Art. 32", "obl-1"), ] runner = BatchDedupRunner(db=db) count = runner._transfer_parent_links("master-uuid", "dup-uuid") assert count == 0 # --------------------------------------------------------------------------- # Title-identical Short-circuit TESTS # --------------------------------------------------------------------------- class TestTitleIdenticalShortCircuit: @pytest.mark.asyncio async def test_identical_titles_skip_embedding(self): """Controls with identical titles in same hint group → direct link.""" db = MagicMock() db.execute = MagicMock() db.commit = MagicMock() db.execute.return_value.fetchall.return_value = [] runner = BatchDedupRunner(db=db) controls = [ _make_control("m", reqs=3, hint="implement:mfa:none", title="MFA implementieren"), _make_control("c", reqs=1, hint="implement:mfa:none", title="MFA implementieren"), ] with patch("compliance.services.batch_dedup_runner.get_embedding", new_callable=AsyncMock) as mock_embed, \ patch("compliance.services.batch_dedup_runner.qdrant_upsert", new_callable=AsyncMock, return_value=True): await runner._process_hint_group("implement:mfa:none", controls, dry_run=False) # Embedding should only be called for the master (indexing), not for linking assert runner.stats["linked"] == 1 assert runner.stats["skipped_title_identical"] == 1 @pytest.mark.asyncio async def test_different_titles_use_embedding(self): """Controls with different titles should use embedding check.""" db = MagicMock() db.execute = MagicMock() db.commit = MagicMock() db.execute.return_value.fetchall.return_value = [] runner = BatchDedupRunner(db=db) controls = [ _make_control("m", reqs=3, hint="implement:mfa:none", title="MFA implementieren fuer Admins"), _make_control("c", reqs=1, hint="implement:mfa:none", title="MFA einrichten fuer alle Benutzer"), ] with patch("compliance.services.batch_dedup_runner.get_embedding", new_callable=AsyncMock, return_value=[0.1] * 1024) as mock_embed, \ patch("compliance.services.batch_dedup_runner.qdrant_upsert", new_callable=AsyncMock, return_value=True), \ patch("compliance.services.batch_dedup_runner.qdrant_search_cross_regulation", new_callable=AsyncMock, return_value=[]): await runner._process_hint_group("implement:mfa:none", controls, dry_run=False) # Different titles → embedding was called for both (master + candidate) assert mock_embed.call_count >= 2 # No Qdrant results → linked anyway (same hint = same action+object) assert runner.stats["linked"] == 1 # --------------------------------------------------------------------------- # Cross-Group Pass TESTS # --------------------------------------------------------------------------- class TestCrossGroupPass: @pytest.mark.asyncio async def test_cross_group_creates_link(self): db = MagicMock() db.commit = MagicMock() # First call returns masters, subsequent calls return empty (for transfer) master_rows = [ ("uuid-1", "CTRL-001", "MFA implementieren", "implement:multi_factor_auth:none"), ] call_count = {"n": 0} def mock_execute(stmt, params=None): result = MagicMock() call_count["n"] += 1 if call_count["n"] == 1: result.fetchall.return_value = master_rows else: result.fetchall.return_value = [] return result db.execute = mock_execute runner = BatchDedupRunner(db=db) cross_result = [{ "score": 0.95, "payload": { "control_uuid": "uuid-2", "control_id": "CTRL-002", "merge_group_hint": "implement:mfa:continuous", }, }] with patch("compliance.services.batch_dedup_runner.get_embedding", new_callable=AsyncMock, return_value=[0.1] * 1024), \ patch("compliance.services.batch_dedup_runner.qdrant_search_cross_regulation", new_callable=AsyncMock, return_value=cross_result): await runner._run_cross_group_pass() assert runner.stats["cross_group_linked"] == 1 # --------------------------------------------------------------------------- # Progress Stats TESTS # --------------------------------------------------------------------------- class TestProgressStats: def test_get_status(self): db = MagicMock() runner = BatchDedupRunner(db=db) runner.stats["masters"] = 42 runner.stats["linked"] = 100 runner._progress_phase = "phase1" runner._progress_count = 500 runner._progress_total = 85000 status = runner.get_status() assert status["phase"] == "phase1" assert status["progress"] == 500 assert status["total"] == 85000 assert status["masters"] == 42 assert status["linked"] == 100 # --------------------------------------------------------------------------- # Route endpoint TESTS # --------------------------------------------------------------------------- class TestBatchDedupRoutes: """Test the batch-dedup API endpoints.""" def test_status_endpoint_not_running(self): from fastapi import FastAPI from fastapi.testclient import TestClient from compliance.api.crosswalk_routes import router app = FastAPI() app.include_router(router, prefix="/api/compliance") client = TestClient(app) with patch("compliance.api.crosswalk_routes.SessionLocal") as mock_session: mock_db = MagicMock() mock_session.return_value = mock_db mock_db.execute.return_value.fetchone.return_value = (85000, 0, 85000) resp = client.get("/api/compliance/v1/canonical/migrate/batch-dedup/status") assert resp.status_code == 200 data = resp.json() assert data["running"] is False # --------------------------------------------------------------------------- # HELPERS # --------------------------------------------------------------------------- def _make_control( prefix: str, reqs: int = 0, tests: int = 0, evidence: int = 0, hint: str = "", title: str = None, pattern_id: str = None, ) -> dict: """Build a mock control dict for testing.""" return { "uuid": f"{prefix}-uuid", "control_id": f"CTRL-{prefix}", "title": title or f"Control {prefix}", "objective": f"Objective for {prefix}", "pattern_id": pattern_id, "requirements": json.dumps([f"r{i}" for i in range(reqs)]), "test_procedure": json.dumps([f"t{i}" for i in range(tests)]), "evidence": json.dumps([f"e{i}" for i in range(evidence)]), "release_state": "draft", "merge_group_hint": hint, "action_object_class": "", }