Files
breakpilot-compliance/backend-compliance/tests/test_batch_dedup_runner.py
Benjamin Admin 770f0b5ab0
All checks were successful
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Successful in 31s
CI/CD / test-python-backend-compliance (push) Successful in 31s
CI/CD / test-python-document-crawler (push) Successful in 21s
CI/CD / test-python-dsms-gateway (push) Successful in 19s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Successful in 2s
fix: adapt batch dedup to NULL pattern_id — group by merge_group_hint
All Pass 0b controls have pattern_id=NULL. Rewritten to:
- Phase 1: Group by merge_group_hint (action:object:trigger), 52k groups
- Phase 2: Cross-group embedding search for semantically similar masters
- Qdrant search uses unfiltered cross-regulation endpoint
- API param changed: pattern_id → hint_filter

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-24 07:24:02 +01:00

441 lines
16 KiB
Python

"""Tests for Batch Dedup Runner (batch_dedup_runner.py).
Covers:
- quality_score(): Richness ranking
- BatchDedupRunner._sub_group_by_merge_hint(): Composite key grouping
- Master selection (highest quality score wins)
- Duplicate linking (mark + parent-link transfer)
- Dry run mode (no DB changes)
- Cross-group pass
- Progress reporting / stats
"""
import json
import pytest
from unittest.mock import MagicMock, AsyncMock, patch, call
from compliance.services.batch_dedup_runner import (
quality_score,
BatchDedupRunner,
DEDUP_COLLECTION,
)
# ---------------------------------------------------------------------------
# quality_score TESTS
# ---------------------------------------------------------------------------
class TestQualityScore:
    """Quality scoring: richer controls should score higher.

    The expectations in this class pin the following weights: 2.0 per
    requirement, 1.5 per test-procedure step, 1.0 per evidence item, and
    ``len(objective) / 200`` capped at 3.0.
    """

    def test_empty_control(self):
        """A control without any fields scores exactly zero."""
        score = quality_score({})
        assert score == 0.0

    def test_requirements_weight(self):
        """Each requirement contributes 2.0 to the score."""
        score = quality_score({"requirements": json.dumps(["r1", "r2", "r3"])})
        assert score == pytest.approx(6.0)  # 3 * 2.0

    def test_test_procedure_weight(self):
        """Each test-procedure entry contributes 1.5 to the score."""
        score = quality_score({"test_procedure": json.dumps(["t1", "t2"])})
        assert score == pytest.approx(3.0)  # 2 * 1.5

    def test_evidence_weight(self):
        """Each evidence entry contributes 1.0 to the score."""
        score = quality_score({"evidence": json.dumps(["e1"])})
        assert score == pytest.approx(1.0)  # 1 * 1.0

    def test_objective_weight_capped(self):
        """Objective length contributes len/200, but never more than 3.0."""
        short = quality_score({"objective": "x" * 100})
        long = quality_score({"objective": "x" * 1000})
        assert short == pytest.approx(0.5)  # 100/200
        assert long == pytest.approx(3.0)  # capped at 3.0

    def test_combined_score(self):
        """All four components add up."""
        control = {
            "requirements": json.dumps(["r1", "r2"]),
            "test_procedure": json.dumps(["t1"]),
            "evidence": json.dumps(["e1", "e2"]),
            "objective": "x" * 400,
        }
        # 2*2 + 1*1.5 + 2*1.0 + min(400/200, 3) = 4 + 1.5 + 2 + 2 = 9.5
        assert quality_score(control) == pytest.approx(9.5)

    def test_json_string_vs_list(self):
        """Both JSON strings and already-parsed lists should work."""
        items = ["r1", "r2"]
        from_dumps = quality_score({"requirements": json.dumps(items)})
        from_literal = quality_score({"requirements": '["r1", "r2"]'})
        # The two string spellings are byte-identical, so comparing only them
        # proves nothing about list support; the parsed list is the real case.
        from_list = quality_score({"requirements": items})
        assert from_dumps == from_literal
        assert from_list == from_dumps

    def test_null_fields(self):
        """None values should not crash."""
        score = quality_score({
            "requirements": None,
            "test_procedure": None,
            "evidence": None,
            "objective": None,
        })
        assert score == 0.0

    def test_ranking_order(self):
        """Rich control should rank above sparse control."""
        rich = {
            "requirements": json.dumps(["r1", "r2", "r3"]),
            "test_procedure": json.dumps(["t1", "t2"]),
            "evidence": json.dumps(["e1"]),
            "objective": "A comprehensive objective for this control.",
        }
        sparse = {
            "requirements": json.dumps(["r1"]),
            "objective": "Short",
        }
        assert quality_score(rich) > quality_score(sparse)
# ---------------------------------------------------------------------------
# Sub-grouping TESTS
# ---------------------------------------------------------------------------
class TestSubGrouping:
    """Grouping of controls by their ``merge_group_hint`` value."""

    def _make_runner(self):
        """Return a runner wired to a mocked DB session."""
        return BatchDedupRunner(db=MagicMock())

    def test_groups_by_merge_hint(self):
        """Controls sharing a hint land in the same group."""
        hint_by_uuid = (
            ("a", "implement:mfa:none"),
            ("b", "implement:mfa:none"),
            ("c", "test:firewall:periodic"),
        )
        controls = [
            {"uuid": uid, "merge_group_hint": hint} for uid, hint in hint_by_uuid
        ]

        groups = self._make_runner()._sub_group_by_merge_hint(controls)

        expected_sizes = {
            "implement:mfa:none": 2,
            "test:firewall:periodic": 1,
        }
        assert len(groups) == len(expected_sizes)
        for hint, size in expected_sizes.items():
            assert len(groups[hint]) == size

    def test_empty_hint_gets_own_group(self):
        """Controls with an empty hint are never merged with each other."""
        controls = [{"uuid": uid, "merge_group_hint": ""} for uid in ("x", "y")]

        groups = self._make_runner()._sub_group_by_merge_hint(controls)

        # Each empty-hint control gets its own group
        assert len(groups) == 2

    def test_single_control_single_group(self):
        """A lone control yields exactly one group."""
        only_control = {"uuid": "a", "merge_group_hint": "implement:mfa:none"}

        groups = self._make_runner()._sub_group_by_merge_hint([only_control])

        assert len(groups) == 1
# ---------------------------------------------------------------------------
# Master Selection TESTS
# ---------------------------------------------------------------------------
class TestMasterSelection:
    """Best quality score should become master."""

    @pytest.mark.asyncio
    async def test_highest_score_is_master(self):
        """In a group, the control with highest quality_score is master.

        Three controls with the same hint and title are processed in dry-run
        mode; the runner must report one master and two title-identical links.
        """
        db = MagicMock()
        db.execute = MagicMock()
        db.commit = MagicMock()
        # Mock parent link transfer query
        db.execute.return_value.fetchall.return_value = []
        runner = BatchDedupRunner(db=db)

        sparse = _make_control("s1", reqs=1, hint="implement:mfa:none",
                               title="MFA implementiert")
        rich = _make_control("r1", reqs=5, tests=3, evidence=2,
                             hint="implement:mfa:none", title="MFA implementiert")
        medium = _make_control("m1", reqs=2, tests=1,
                               hint="implement:mfa:none", title="MFA implementiert")

        # Pin the premise of the test: the fixtures really are ordered
        # rich > medium > sparse by quality_score, so "rich" is the one that
        # must be picked as master regardless of input order.
        assert quality_score(rich) > quality_score(medium) > quality_score(sparse)

        # Deliberately not sorted by quality, so selection cannot rely on order.
        controls = [sparse, medium, rich]

        # All have same title → all should be title-identical linked
        with patch("compliance.services.batch_dedup_runner.get_embedding",
                   new_callable=AsyncMock, return_value=[0.1] * 1024), \
                patch("compliance.services.batch_dedup_runner.qdrant_upsert",
                      new_callable=AsyncMock, return_value=True):
            await runner._process_hint_group("implement:mfa:none", controls, dry_run=True)

        # Rich should be master (1 master), others linked (2 linked)
        assert runner.stats["masters"] == 1
        assert runner.stats["linked"] == 2
        assert runner.stats["skipped_title_identical"] == 2
# ---------------------------------------------------------------------------
# Dry Run TESTS
# ---------------------------------------------------------------------------
class TestDryRun:
    """Dry run should compute stats but NOT modify DB."""

    @pytest.mark.asyncio
    async def test_dry_run_no_db_writes(self):
        """Process a two-control group in dry-run mode; nothing is committed."""
        session = MagicMock()
        session.execute = MagicMock()
        session.commit = MagicMock()
        runner = BatchDedupRunner(db=session)

        group = [
            _make_control(prefix, reqs=req_count, hint="implement:mfa:none",
                          title="MFA impl")
            for prefix, req_count in (("a", 3), ("b", 1))
        ]

        # Embedding and Qdrant upsert are stubbed with async mocks.
        embed_patch = patch(
            "compliance.services.batch_dedup_runner.get_embedding",
            new_callable=AsyncMock,
            return_value=[0.1] * 1024,
        )
        upsert_patch = patch(
            "compliance.services.batch_dedup_runner.qdrant_upsert",
            new_callable=AsyncMock,
            return_value=True,
        )
        with embed_patch, upsert_patch:
            await runner._process_hint_group("implement:mfa:none", group, dry_run=True)

        stats = runner.stats
        assert stats["masters"] == 1
        assert stats["linked"] == 1
        # No commit for dedup operations in dry_run
        session.commit.assert_not_called()
# ---------------------------------------------------------------------------
# Parent Link Transfer TESTS
# ---------------------------------------------------------------------------
class TestParentLinkTransfer:
    """Parent links should migrate from duplicate to master."""

    def test_transfer_parent_links(self):
        """Two parent links on the duplicate yield two transfers."""
        # Mock: duplicate has 2 parent links
        parent_rows = [
            ("parent-1", "decomposition", 1.0, "DSGVO", "Art. 32", "obl-1"),
            ("parent-2", "decomposition", 0.9, "NIS2", "Art. 21", "obl-2"),
        ]
        session = MagicMock()
        session.execute.return_value.fetchall.return_value = parent_rows

        transferred = BatchDedupRunner(db=session)._transfer_parent_links(
            "master-uuid", "dup-uuid"
        )

        assert transferred == 2
        # Two INSERT calls for the transferred links
        assert session.execute.call_count == 3  # 1 SELECT + 2 INSERTs

    def test_transfer_skips_self_reference(self):
        """A link whose parent is the master itself is not transferred."""
        # Parent link points to master itself → should be skipped
        self_link = ("master-uuid", "decomposition", 1.0, "DSGVO", "Art. 32", "obl-1")
        session = MagicMock()
        session.execute.return_value.fetchall.return_value = [self_link]

        transferred = BatchDedupRunner(db=session)._transfer_parent_links(
            "master-uuid", "dup-uuid"
        )

        assert transferred == 0
# ---------------------------------------------------------------------------
# Title-identical Short-circuit TESTS
# ---------------------------------------------------------------------------
class TestTitleIdenticalShortCircuit:
    """Title-identical candidates are linked without an embedding comparison."""

    @pytest.mark.asyncio
    async def test_identical_titles_skip_embedding(self):
        """Controls with identical titles in same hint group → direct link."""
        db = MagicMock()
        db.execute = MagicMock()
        db.commit = MagicMock()
        db.execute.return_value.fetchall.return_value = []
        runner = BatchDedupRunner(db=db)
        controls = [
            _make_control("m", reqs=3, hint="implement:mfa:none",
                          title="MFA implementieren"),
            _make_control("c", reqs=1, hint="implement:mfa:none",
                          title="MFA implementieren"),
        ]

        with patch("compliance.services.batch_dedup_runner.get_embedding",
                   new_callable=AsyncMock) as mock_embed, \
                patch("compliance.services.batch_dedup_runner.qdrant_upsert",
                      new_callable=AsyncMock, return_value=True):
            await runner._process_hint_group("implement:mfa:none", controls, dry_run=False)

        # Embedding should only be called for the master (indexing), not for linking.
        # Without this check the test name's "skip embedding" claim is unverified.
        assert mock_embed.call_count <= 1
        assert runner.stats["linked"] == 1
        assert runner.stats["skipped_title_identical"] == 1

    @pytest.mark.asyncio
    async def test_different_titles_use_embedding(self):
        """Controls with different titles should use embedding check."""
        db = MagicMock()
        db.execute = MagicMock()
        db.commit = MagicMock()
        db.execute.return_value.fetchall.return_value = []
        runner = BatchDedupRunner(db=db)
        controls = [
            _make_control("m", reqs=3, hint="implement:mfa:none",
                          title="MFA implementieren fuer Admins"),
            _make_control("c", reqs=1, hint="implement:mfa:none",
                          title="MFA einrichten fuer alle Benutzer"),
        ]

        with patch("compliance.services.batch_dedup_runner.get_embedding",
                   new_callable=AsyncMock, return_value=[0.1] * 1024) as mock_embed, \
                patch("compliance.services.batch_dedup_runner.qdrant_upsert",
                      new_callable=AsyncMock, return_value=True), \
                patch("compliance.services.batch_dedup_runner.qdrant_search_cross_regulation",
                      new_callable=AsyncMock, return_value=[]):
            await runner._process_hint_group("implement:mfa:none", controls, dry_run=False)

        # Different titles → embedding was called for both (master + candidate)
        assert mock_embed.call_count >= 2
        # No Qdrant results → linked anyway (same hint = same action+object)
        assert runner.stats["linked"] == 1
# ---------------------------------------------------------------------------
# Cross-Group Pass TESTS
# ---------------------------------------------------------------------------
class TestCrossGroupPass:
    """Cross-group pass: a similar master from another hint group is linked."""

    @pytest.mark.asyncio
    async def test_cross_group_creates_link(self):
        """One master plus one high-score search hit yields one cross-group link."""
        session = MagicMock()
        session.commit = MagicMock()

        # First call returns masters, subsequent calls return empty (for transfer)
        master_rows = [
            ("uuid-1", "CTRL-001", "MFA implementieren",
             "implement:multi_factor_auth:none"),
        ]
        queued_results = [master_rows]

        def fake_execute(stmt, params=None):
            outcome = MagicMock()
            # Hand out the queued rows once; every later query sees no rows.
            outcome.fetchall.return_value = queued_results.pop(0) if queued_results else []
            return outcome

        session.execute = fake_execute
        runner = BatchDedupRunner(db=session)

        search_hit = {
            "score": 0.95,
            "payload": {
                "control_uuid": "uuid-2",
                "control_id": "CTRL-002",
                "merge_group_hint": "implement:mfa:continuous",
            },
        }

        embed_patch = patch(
            "compliance.services.batch_dedup_runner.get_embedding",
            new_callable=AsyncMock,
            return_value=[0.1] * 1024,
        )
        search_patch = patch(
            "compliance.services.batch_dedup_runner.qdrant_search_cross_regulation",
            new_callable=AsyncMock,
            return_value=[search_hit],
        )
        with embed_patch, search_patch:
            await runner._run_cross_group_pass()

        assert runner.stats["cross_group_linked"] == 1
# ---------------------------------------------------------------------------
# Progress Stats TESTS
# ---------------------------------------------------------------------------
class TestProgressStats:
    """Status reporting reflects the runner's counters and progress fields."""

    def test_get_status(self):
        """get_status() exposes phase, progress, total and stat counters."""
        runner = BatchDedupRunner(db=MagicMock())

        # Seed the counters and progress attributes the status is checked against.
        runner.stats["masters"] = 42
        runner.stats["linked"] = 100
        runner._progress_phase = "phase1"
        runner._progress_count = 500
        runner._progress_total = 85000

        status = runner.get_status()

        expected = {
            "phase": "phase1",
            "progress": 500,
            "total": 85000,
            "masters": 42,
            "linked": 100,
        }
        for key, value in expected.items():
            assert status[key] == value
# ---------------------------------------------------------------------------
# Route endpoint TESTS
# ---------------------------------------------------------------------------
class TestBatchDedupRoutes:
    """Test the batch-dedup API endpoints."""

    def test_status_endpoint_not_running(self):
        """The status endpoint answers 200 with ``running`` set to False."""
        from fastapi import FastAPI
        from fastapi.testclient import TestClient
        from compliance.api.crosswalk_routes import router

        application = FastAPI()
        application.include_router(router, prefix="/api/compliance")
        http = TestClient(application)

        # Replace the session factory so the route reads from a mocked DB row.
        with patch("compliance.api.crosswalk_routes.SessionLocal") as session_factory:
            fake_db = MagicMock()
            session_factory.return_value = fake_db
            fake_db.execute.return_value.fetchone.return_value = (85000, 0, 85000)
            response = http.get(
                "/api/compliance/v1/canonical/migrate/batch-dedup/status"
            )

        assert response.status_code == 200
        body = response.json()
        assert body["running"] is False
# ---------------------------------------------------------------------------
# HELPERS
# ---------------------------------------------------------------------------
def _make_control(
    prefix: str,
    reqs: int = 0,
    tests: int = 0,
    evidence: int = 0,
    hint: str = "",
    title: str = None,
    pattern_id: str = None,
) -> dict:
    """Build a mock control dict for testing.

    ``prefix`` seeds the uuid, control id, default title and objective.
    ``reqs``, ``tests`` and ``evidence`` are the number of generated entries
    in the JSON-encoded ``requirements``, ``test_procedure`` and ``evidence``
    fields. A falsy ``title`` falls back to ``"Control <prefix>"``.
    """

    def _encoded_series(tag, count):
        # JSON array of "<tag>0", "<tag>1", ... with `count` items.
        return json.dumps([f"{tag}{index}" for index in range(count)])

    if title:
        resolved_title = title
    else:
        resolved_title = f"Control {prefix}"

    control = {}
    control["uuid"] = f"{prefix}-uuid"
    control["control_id"] = f"CTRL-{prefix}"
    control["title"] = resolved_title
    control["objective"] = f"Objective for {prefix}"
    control["pattern_id"] = pattern_id
    control["requirements"] = _encoded_series("r", reqs)
    control["test_procedure"] = _encoded_series("t", tests)
    control["evidence"] = _encoded_series("e", evidence)
    control["release_state"] = "draft"
    control["merge_group_hint"] = hint
    control["action_object_class"] = ""
    return control