feat: Batch Dedup Runner — 85k→~18-25k Master Controls
All checks were successful
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Successful in 32s
CI/CD / test-python-backend-compliance (push) Successful in 30s
CI/CD / test-python-document-crawler (push) Successful in 20s
CI/CD / test-python-dsms-gateway (push) Successful in 16s
CI/CD / validate-canonical-controls (push) Successful in 9s
CI/CD / Deploy (push) Successful in 1s
All checks were successful
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Successful in 32s
CI/CD / test-python-backend-compliance (push) Successful in 30s
CI/CD / test-python-document-crawler (push) Successful in 20s
CI/CD / test-python-dsms-gateway (push) Successful in 16s
CI/CD / validate-canonical-controls (push) Successful in 9s
CI/CD / Deploy (push) Successful in 1s
Adds batch orchestration for deduplicating ~85k Pass 0b atomic controls into ~18-25k unique masters with M:N parent linking.

New files:
- migrations/078_batch_dedup.sql: merged_into_uuid column, perf indexes, link_type CHECK extended for cross_regulation
- batch_dedup_runner.py: BatchDedupRunner with quality scoring, merge-hint grouping, title-identical short-circuit, parent-link transfer, and cross-regulation pass
- tests/test_batch_dedup_runner.py: 21 tests (all passing)

Modified:
- control_dedup.py: optional collection param on Qdrant functions
- crosswalk_routes.py: POST/GET batch-dedup endpoints

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
433
backend-compliance/tests/test_batch_dedup_runner.py
Normal file
433
backend-compliance/tests/test_batch_dedup_runner.py
Normal file
@@ -0,0 +1,433 @@
|
||||
"""Tests for Batch Dedup Runner (batch_dedup_runner.py).
|
||||
|
||||
Covers:
|
||||
- quality_score(): Richness ranking
|
||||
- BatchDedupRunner._sub_group_by_merge_hint(): Composite key grouping
|
||||
- Master selection (highest quality score wins)
|
||||
- Duplicate linking (mark + parent-link transfer)
|
||||
- Dry run mode (no DB changes)
|
||||
- Cross-regulation pass
|
||||
- Progress reporting / stats
|
||||
"""
|
||||
|
||||
import json
|
||||
import pytest
|
||||
from unittest.mock import MagicMock, AsyncMock, patch, call
|
||||
|
||||
from compliance.services.batch_dedup_runner import (
|
||||
quality_score,
|
||||
BatchDedupRunner,
|
||||
DEDUP_COLLECTION,
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# quality_score TESTS
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestQualityScore:
    """Checks for quality_score(): richer controls must score higher."""

    def test_empty_control(self):
        """A control without any fields scores zero."""
        assert quality_score({}) == 0.0

    def test_requirements_weight(self):
        """Three requirements are expected to yield 6.0 points."""
        control = {"requirements": json.dumps(["r1", "r2", "r3"])}
        assert quality_score(control) == pytest.approx(6.0)  # 3 * 2.0

    def test_test_procedure_weight(self):
        """Two test-procedure steps are expected to yield 3.0 points."""
        control = {"test_procedure": json.dumps(["t1", "t2"])}
        assert quality_score(control) == pytest.approx(3.0)  # 2 * 1.5

    def test_evidence_weight(self):
        """One evidence item is expected to yield 1.0 point."""
        control = {"evidence": json.dumps(["e1"])}
        assert quality_score(control) == pytest.approx(1.0)  # 1 * 1.0

    def test_objective_weight_capped(self):
        """Objective length adds len/200 points, capped at 3.0."""
        below_cap = quality_score({"objective": "x" * 100})
        above_cap = quality_score({"objective": "x" * 1000})
        assert below_cap == pytest.approx(0.5)  # 100/200
        assert above_cap == pytest.approx(3.0)  # capped at 3.0

    def test_combined_score(self):
        """All four components add up."""
        control = dict(
            requirements=json.dumps(["r1", "r2"]),
            test_procedure=json.dumps(["t1"]),
            evidence=json.dumps(["e1", "e2"]),
            objective="x" * 400,
        )
        # 2*2 + 1*1.5 + 2*1.0 + min(400/200, 3) = 4 + 1.5 + 2 + 2 = 9.5
        expected = pytest.approx(9.5)
        assert quality_score(control) == expected

    def test_json_string_vs_list(self):
        """A json.dumps() result and a hand-written JSON literal score equally.

        NOTE(review): both inputs are JSON *strings*; an already-parsed
        Python list is not exercised here despite the test's name.
        """
        from_dumps = quality_score({"requirements": json.dumps(["r1", "r2"])})
        from_literal = quality_score({"requirements": '["r1", "r2"]'})
        assert from_dumps == from_literal

    def test_null_fields(self):
        """None values must not crash and contribute nothing."""
        all_none = dict.fromkeys(
            ("requirements", "test_procedure", "evidence", "objective")
        )
        assert quality_score(all_none) == 0.0

    def test_ranking_order(self):
        """A richly populated control outranks a sparse one."""
        sparse = {
            "requirements": json.dumps(["r1"]),
            "objective": "Short",
        }
        rich = {
            "requirements": json.dumps(["r1", "r2", "r3"]),
            "test_procedure": json.dumps(["t1", "t2"]),
            "evidence": json.dumps(["e1"]),
            "objective": "A comprehensive objective for this control.",
        }
        rich_score = quality_score(rich)
        sparse_score = quality_score(sparse)
        assert rich_score > sparse_score
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Sub-grouping TESTS
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestSubGrouping:
    """Checks for BatchDedupRunner._sub_group_by_merge_hint()."""

    def _make_runner(self):
        """Return a runner backed by a mocked DB session."""
        return BatchDedupRunner(db=MagicMock())

    def test_groups_by_merge_hint(self):
        """Controls sharing a merge hint end up in the same group."""
        hint_by_uuid = {
            "a": "implement:mfa:none",
            "b": "implement:mfa:none",
            "c": "test:firewall:periodic",
        }
        controls = [
            {"uuid": uid, "merge_group_hint": hint}
            for uid, hint in hint_by_uuid.items()
        ]

        groups = self._make_runner()._sub_group_by_merge_hint(controls)

        assert len(groups) == 2
        assert len(groups["implement:mfa:none"]) == 2
        assert len(groups["test:firewall:periodic"]) == 1

    def test_empty_hint_gets_own_group(self):
        """Controls with an empty hint are not merged with each other."""
        controls = [{"uuid": uid, "merge_group_hint": ""} for uid in ("x", "y")]

        groups = self._make_runner()._sub_group_by_merge_hint(controls)

        # Each empty-hint control gets its own group
        assert len(groups) == 2

    def test_single_control_single_group(self):
        """One control yields exactly one group."""
        only_control = {"uuid": "a", "merge_group_hint": "implement:mfa:none"}

        groups = self._make_runner()._sub_group_by_merge_hint([only_control])

        assert len(groups) == 1
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Master Selection TESTS
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestMasterSelection:
    """The control with the best quality score should become master."""

    @pytest.mark.asyncio
    async def test_highest_score_is_master(self):
        """Within one group, one master is elected and the rest are linked."""
        session = MagicMock()
        session.execute = MagicMock()
        session.commit = MagicMock()
        runner = BatchDedupRunner(db=session)

        shared_hint = "implement:mfa:none"
        sparse = _make_control("s1", reqs=1, hint=shared_hint)
        rich = _make_control("r1", reqs=5, tests=3, evidence=2, hint=shared_hint)
        medium = _make_control("m1", reqs=2, tests=1, hint=shared_hint)

        # The similarity search always reports the rich control as a close match.
        search_hit = {
            "score": 0.95,
            "payload": {
                "control_uuid": rich["uuid"],
                "control_id": rich["control_id"],
            },
        }

        # Mock embedding and Qdrant access to avoid real API calls
        embed_patch = patch(
            "compliance.services.batch_dedup_runner.get_embedding",
            new_callable=AsyncMock,
            return_value=[0.1] * 1024,
        )
        upsert_patch = patch(
            "compliance.services.batch_dedup_runner.qdrant_upsert",
            new_callable=AsyncMock,
            return_value=True,
        )
        search_patch = patch(
            "compliance.services.batch_dedup_runner.qdrant_search",
            new_callable=AsyncMock,
            return_value=[search_hit],
        )

        with embed_patch, upsert_patch, search_patch:
            await runner._process_pattern_group(
                "CP-AUTH-001", [sparse, medium, rich], dry_run=True
            )

        # Rich should be master (1 master), others linked (2 linked)
        assert runner.stats["masters"] == 1
        assert runner.stats["linked"] == 2
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Dry Run TESTS
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestDryRun:
    """Dry run should compute stats but NOT modify DB."""

    @pytest.mark.asyncio
    async def test_dry_run_no_db_writes(self):
        """A dry run elects a master without indexing or committing.

        The patched ``qdrant_upsert`` mock is bound inside the ``with``
        statement so the "no indexing" expectation is actually asserted.
        Re-importing the name after the patch has exited would only yield
        the real, unpatched function.
        """
        db = MagicMock()
        db.execute = MagicMock()
        db.commit = MagicMock()

        runner = BatchDedupRunner(db=db)

        controls = [
            _make_control("a", reqs=3, hint="implement:mfa:none"),
            _make_control("b", reqs=1, hint="implement:mfa:none"),
        ]

        with patch("compliance.services.batch_dedup_runner.get_embedding",
                   new_callable=AsyncMock, return_value=[0.1] * 1024), \
             patch("compliance.services.batch_dedup_runner.qdrant_upsert",
                   new_callable=AsyncMock, return_value=True) as mock_upsert, \
             patch("compliance.services.batch_dedup_runner.qdrant_search",
                   new_callable=AsyncMock, return_value=[{
                       "score": 0.95,
                       "payload": {"control_uuid": "a-uuid",
                                   "control_id": "AUTH-001"},
                   }]):
            await runner._process_pattern_group("CP-AUTH-001", controls, dry_run=True)

        # Stats are still computed in dry-run mode.
        assert runner.stats["masters"] == 1
        # qdrant_upsert should NOT have been called (dry_run skips indexing)
        mock_upsert.assert_not_called()
        # No commit for dedup operations
        db.commit.assert_not_called()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Parent Link Transfer TESTS
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestParentLinkTransfer:
    """Parent links should migrate from the duplicate to the master."""

    def test_transfer_parent_links(self):
        """Both parent links of the duplicate are transferred."""
        # Mock: duplicate has 2 parent links
        parent_rows = [
            ("parent-1", "decomposition", 1.0, "DSGVO", "Art. 32", "obl-1"),
            ("parent-2", "decomposition", 0.9, "NIS2", "Art. 21", "obl-2"),
        ]
        session = MagicMock()
        session.execute.return_value.fetchall.return_value = parent_rows

        transferred = BatchDedupRunner(db=session)._transfer_parent_links(
            "master-uuid", "dup-uuid"
        )

        assert transferred == 2
        # 1 SELECT + 2 INSERTs for the transferred links
        assert session.execute.call_count == 3

    def test_transfer_skips_self_reference(self):
        """A parent link that points at the master itself is not transferred."""
        # Parent link points to master itself → should be skipped
        self_link = ("master-uuid", "decomposition", 1.0, "DSGVO", "Art. 32", "obl-1")
        session = MagicMock()
        session.execute.return_value.fetchall.return_value = [self_link]

        transferred = BatchDedupRunner(db=session)._transfer_parent_links(
            "master-uuid", "dup-uuid"
        )

        assert transferred == 0
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Title-identical Short-circuit TESTS
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestTitleIdenticalShortCircuit:
    """Identical titles should be linked without computing an embedding."""

    @pytest.mark.asyncio
    async def test_identical_titles_skip_embedding(self):
        """Controls with identical titles in same merge group → direct link."""
        session = MagicMock()
        session.execute = MagicMock()
        session.commit = MagicMock()
        # Mock the parent link transfer query: nothing to transfer
        session.execute.return_value.fetchall.return_value = []

        runner = BatchDedupRunner(db=session)

        common = {"hint": "implement:mfa:none", "title": "MFA implementieren"}
        master = _make_control("m", reqs=3, **common)
        candidate = _make_control("c", reqs=1, **common)

        embed_patch = patch(
            "compliance.services.batch_dedup_runner.get_embedding",
            new_callable=AsyncMock,
        )
        with embed_patch as mock_embed:
            await runner._check_and_link(
                master, candidate, "CP-AUTH-001", dry_run=False
            )

        # Embedding should NOT be called (title-identical short-circuit)
        mock_embed.assert_not_called()
        assert runner.stats["linked"] == 1
        assert runner.stats["skipped_title_identical"] == 1
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Cross-Regulation Pass TESTS
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestCrossRegulationPass:
    """Checks for BatchDedupRunner._run_cross_regulation_pass()."""

    @pytest.mark.asyncio
    async def test_cross_reg_creates_link(self):
        """A close match from a different pattern is counted as a link."""
        session = MagicMock()
        session.execute = MagicMock()
        session.commit = MagicMock()
        # Rows handed back by fetchall(); stands in for the "load masters" query
        master_row = (
            "uuid-1",
            "AUTH-001",
            "MFA implementieren",
            "CP-AUTH-001",
            "implement:multi_factor_auth:none",
        )
        session.execute.return_value.fetchall.return_value = [master_row]

        runner = BatchDedupRunner(db=session)

        foreign_hit = {
            "score": 0.96,
            "payload": {
                "control_uuid": "uuid-2",
                "control_id": "SEC-001",
                "pattern_id": "CP-SEC-001",  # different pattern!
            },
        }

        embed_patch = patch(
            "compliance.services.batch_dedup_runner.get_embedding",
            new_callable=AsyncMock,
            return_value=[0.1] * 1024,
        )
        search_patch = patch(
            "compliance.services.batch_dedup_runner.qdrant_search_cross_regulation",
            new_callable=AsyncMock,
            return_value=[foreign_hit],
        )
        with embed_patch, search_patch:
            await runner._run_cross_regulation_pass()

        assert runner.stats["cross_reg_linked"] == 1

    @pytest.mark.asyncio
    async def test_cross_reg_ignores_same_pattern(self):
        """Cross-reg should NOT link controls from same pattern."""
        session = MagicMock()
        session.execute = MagicMock()
        session.commit = MagicMock()
        master_row = ("uuid-1", "AUTH-001", "MFA", "CP-AUTH-001", "implement:mfa:none")
        session.execute.return_value.fetchall.return_value = [master_row]

        runner = BatchDedupRunner(db=session)

        # Match from SAME pattern
        same_pattern_hit = {
            "score": 0.97,
            "payload": {
                "control_uuid": "uuid-3",
                "control_id": "AUTH-002",
                "pattern_id": "CP-AUTH-001",  # same pattern
            },
        }

        embed_patch = patch(
            "compliance.services.batch_dedup_runner.get_embedding",
            new_callable=AsyncMock,
            return_value=[0.1] * 1024,
        )
        search_patch = patch(
            "compliance.services.batch_dedup_runner.qdrant_search_cross_regulation",
            new_callable=AsyncMock,
            return_value=[same_pattern_hit],
        )
        with embed_patch, search_patch:
            await runner._run_cross_regulation_pass()

        assert runner.stats["cross_reg_linked"] == 0
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Progress Stats TESTS
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestProgressStats:
    """Checks for BatchDedupRunner.get_status()."""

    def test_get_status(self):
        """get_status() reports the current pattern, progress and counters."""
        runner = BatchDedupRunner(db=MagicMock())
        runner.stats["masters"] = 42
        runner.stats["linked"] = 100
        runner._progress_pattern = "CP-AUTH-001"
        runner._progress_count = 500

        status = runner.get_status()

        expected = {
            "pattern": "CP-AUTH-001",
            "progress": 500,
            "masters": 42,
            "linked": 100,
        }
        # Checked key by key, in the original order; extra keys in the status
        # dict are deliberately ignored.
        for key, value in expected.items():
            assert status[key] == value
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Route endpoint TESTS
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestBatchDedupRoutes:
    """Test the batch-dedup API endpoints."""

    def test_status_endpoint_not_running(self):
        """The status endpoint answers 200 with running == False."""
        from fastapi import FastAPI
        from fastapi.testclient import TestClient
        from compliance.api.crosswalk_routes import router

        application = FastAPI()
        application.include_router(router, prefix="/api/compliance")
        http = TestClient(application)

        status_url = "/api/compliance/v1/canonical/migrate/batch-dedup/status"

        with patch("compliance.api.crosswalk_routes.SessionLocal") as session_factory:
            fake_db = MagicMock()
            # Row returned by the status query's fetchone()
            fake_db.execute.return_value.fetchone.return_value = (85000, 0, 85000)
            session_factory.return_value = fake_db

            response = http.get(status_url)
            assert response.status_code == 200
            payload = response.json()
            assert payload["running"] is False
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# HELPERS
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _make_control(
    prefix: str,
    reqs: int = 0,
    tests: int = 0,
    evidence: int = 0,
    hint: str = "",
    title: "str | None" = None,
    pattern_id: str = "CP-AUTH-001",
) -> dict:
    """Build a mock control dict for testing.

    Args:
        prefix: Short tag used to derive ``uuid`` (``"<prefix>-uuid"``),
            ``control_id`` (``"AUTH-<prefix>"``), the default title and
            the objective text.
        reqs: Number of generated requirement entries (``r0`` … ``r{n-1}``).
        tests: Number of generated test-procedure entries (``t0`` …).
        evidence: Number of generated evidence entries (``e0`` …).
        hint: Value stored under ``merge_group_hint``.
        title: Explicit title; a falsy value (``None`` or ``""``) falls
            back to ``"Control <prefix>"``.
        pattern_id: Value stored under ``pattern_id``.

    Returns:
        A control dict whose list-valued fields (``requirements``,
        ``test_procedure``, ``evidence``) are JSON-encoded strings.
    """
    return {
        "uuid": f"{prefix}-uuid",
        "control_id": f"AUTH-{prefix}",
        "title": title or f"Control {prefix}",
        "objective": f"Objective for {prefix}",
        "pattern_id": pattern_id,
        # List fields are stored as JSON text, not as Python lists.
        "requirements": json.dumps([f"r{i}" for i in range(reqs)]),
        "test_procedure": json.dumps([f"t{i}" for i in range(tests)]),
        "evidence": json.dumps([f"e{i}" for i in range(evidence)]),
        "release_state": "draft",
        "merge_group_hint": hint,
        "action_object_class": "",
    }
|
||||
Reference in New Issue
Block a user