fix: adapt batch dedup to NULL pattern_id — group by merge_group_hint
All checks were successful
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Successful in 31s
CI/CD / test-python-backend-compliance (push) Successful in 31s
CI/CD / test-python-document-crawler (push) Successful in 21s
CI/CD / test-python-dsms-gateway (push) Successful in 19s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Successful in 2s

All Pass 0b controls have pattern_id=NULL, so they can no longer be grouped by pattern_id. Batch dedup was rewritten to:
- Phase 1: Group by merge_group_hint (action:object:trigger), 52k groups
- Phase 2: Cross-group embedding search for semantically similar masters
- Qdrant search now uses the unfiltered cross-regulation endpoint
- API param changed: pattern_id → hint_filter

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-24 07:24:02 +01:00
parent 35784c35eb
commit 770f0b5ab0
3 changed files with 318 additions and 280 deletions

View File

@@ -6,7 +6,7 @@ Covers:
- Master selection (highest quality score wins)
- Duplicate linking (mark + parent-link transfer)
- Dry run mode (no DB changes)
- Cross-regulation pass
- Cross-group pass
- Progress reporting / stats
"""
@@ -147,31 +147,31 @@ class TestMasterSelection:
db = MagicMock()
db.execute = MagicMock()
db.commit = MagicMock()
# Mock parent link transfer query
db.execute.return_value.fetchall.return_value = []
runner = BatchDedupRunner(db=db)
sparse = _make_control("s1", reqs=1, hint="implement:mfa:none")
rich = _make_control("r1", reqs=5, tests=3, evidence=2, hint="implement:mfa:none")
medium = _make_control("m1", reqs=2, tests=1, hint="implement:mfa:none")
sparse = _make_control("s1", reqs=1, hint="implement:mfa:none",
title="MFA implementiert")
rich = _make_control("r1", reqs=5, tests=3, evidence=2,
hint="implement:mfa:none", title="MFA implementiert")
medium = _make_control("m1", reqs=2, tests=1,
hint="implement:mfa:none", title="MFA implementiert")
controls = [sparse, medium, rich]
# Mock embedding to avoid real API calls
# All have same title → all should be title-identical linked
with patch("compliance.services.batch_dedup_runner.get_embedding",
new_callable=AsyncMock, return_value=[0.1] * 1024), \
patch("compliance.services.batch_dedup_runner.qdrant_upsert",
new_callable=AsyncMock, return_value=True), \
patch("compliance.services.batch_dedup_runner.qdrant_search",
new_callable=AsyncMock, return_value=[{
"score": 0.95,
"payload": {"control_uuid": rich["uuid"],
"control_id": rich["control_id"]},
}]):
await runner._process_pattern_group("CP-AUTH-001", controls, dry_run=True)
new_callable=AsyncMock, return_value=True):
await runner._process_hint_group("implement:mfa:none", controls, dry_run=True)
# Rich should be master (1 master), others linked (2 linked)
assert runner.stats["masters"] == 1
assert runner.stats["linked"] == 2
assert runner.stats["skipped_title_identical"] == 2
# ---------------------------------------------------------------------------
@@ -191,28 +191,19 @@ class TestDryRun:
runner = BatchDedupRunner(db=db)
controls = [
_make_control("a", reqs=3, hint="implement:mfa:none"),
_make_control("b", reqs=1, hint="implement:mfa:none"),
_make_control("a", reqs=3, hint="implement:mfa:none", title="MFA impl"),
_make_control("b", reqs=1, hint="implement:mfa:none", title="MFA impl"),
]
with patch("compliance.services.batch_dedup_runner.get_embedding",
new_callable=AsyncMock, return_value=[0.1] * 1024), \
patch("compliance.services.batch_dedup_runner.qdrant_upsert",
new_callable=AsyncMock, return_value=True), \
patch("compliance.services.batch_dedup_runner.qdrant_search",
new_callable=AsyncMock, return_value=[{
"score": 0.95,
"payload": {"control_uuid": "a-uuid",
"control_id": "AUTH-001"},
}]):
await runner._process_pattern_group("CP-AUTH-001", controls, dry_run=True)
new_callable=AsyncMock, return_value=True):
await runner._process_hint_group("implement:mfa:none", controls, dry_run=True)
# No DB execute calls for UPDATE/INSERT (only the initial load query was mocked)
# In dry_run, _mark_duplicate and _embed_and_index are skipped
assert runner.stats["masters"] == 1
# qdrant_upsert should NOT have been called (dry_run skips indexing)
from compliance.services.batch_dedup_runner import qdrant_upsert
# No commit for dedup operations
assert runner.stats["linked"] == 1
# No commit for dedup operations in dry_run
db.commit.assert_not_called()
@@ -261,56 +252,100 @@ class TestTitleIdenticalShortCircuit:
@pytest.mark.asyncio
async def test_identical_titles_skip_embedding(self):
"""Controls with identical titles in same merge group → direct link."""
"""Controls with identical titles in same hint group → direct link."""
db = MagicMock()
db.execute = MagicMock()
db.commit = MagicMock()
# Mock the parent link transfer query
db.execute.return_value.fetchall.return_value = []
runner = BatchDedupRunner(db=db)
master = _make_control("m", reqs=3, hint="implement:mfa:none",
title="MFA implementieren")
candidate = _make_control("c", reqs=1, hint="implement:mfa:none",
title="MFA implementieren")
controls = [
_make_control("m", reqs=3, hint="implement:mfa:none",
title="MFA implementieren"),
_make_control("c", reqs=1, hint="implement:mfa:none",
title="MFA implementieren"),
]
with patch("compliance.services.batch_dedup_runner.get_embedding",
new_callable=AsyncMock) as mock_embed:
await runner._check_and_link(master, candidate, "CP-AUTH-001", dry_run=False)
new_callable=AsyncMock) as mock_embed, \
patch("compliance.services.batch_dedup_runner.qdrant_upsert",
new_callable=AsyncMock, return_value=True):
await runner._process_hint_group("implement:mfa:none", controls, dry_run=False)
# Embedding should NOT be called (title-identical short-circuit)
mock_embed.assert_not_called()
# Embedding should only be called for the master (indexing), not for linking
assert runner.stats["linked"] == 1
assert runner.stats["skipped_title_identical"] == 1
# ---------------------------------------------------------------------------
# Cross-Regulation Pass TESTS
# ---------------------------------------------------------------------------
class TestCrossRegulationPass:
@pytest.mark.asyncio
async def test_cross_reg_creates_link(self):
async def test_different_titles_use_embedding(self):
"""Controls with different titles should use embedding check."""
db = MagicMock()
db.execute = MagicMock()
db.commit = MagicMock()
# First call: load masters
db.execute.return_value.fetchall.return_value = [
("uuid-1", "AUTH-001", "MFA implementieren", "CP-AUTH-001",
db.execute.return_value.fetchall.return_value = []
runner = BatchDedupRunner(db=db)
controls = [
_make_control("m", reqs=3, hint="implement:mfa:none",
title="MFA implementieren fuer Admins"),
_make_control("c", reqs=1, hint="implement:mfa:none",
title="MFA einrichten fuer alle Benutzer"),
]
with patch("compliance.services.batch_dedup_runner.get_embedding",
new_callable=AsyncMock, return_value=[0.1] * 1024) as mock_embed, \
patch("compliance.services.batch_dedup_runner.qdrant_upsert",
new_callable=AsyncMock, return_value=True), \
patch("compliance.services.batch_dedup_runner.qdrant_search_cross_regulation",
new_callable=AsyncMock, return_value=[]):
await runner._process_hint_group("implement:mfa:none", controls, dry_run=False)
# Different titles → embedding was called for both (master + candidate)
assert mock_embed.call_count >= 2
# No Qdrant results → linked anyway (same hint = same action+object)
assert runner.stats["linked"] == 1
# ---------------------------------------------------------------------------
# Cross-Group Pass TESTS
# ---------------------------------------------------------------------------
class TestCrossGroupPass:
@pytest.mark.asyncio
async def test_cross_group_creates_link(self):
db = MagicMock()
db.commit = MagicMock()
# First call returns masters, subsequent calls return empty (for transfer)
master_rows = [
("uuid-1", "CTRL-001", "MFA implementieren",
"implement:multi_factor_auth:none"),
]
call_count = {"n": 0}
def mock_execute(stmt, params=None):
result = MagicMock()
call_count["n"] += 1
if call_count["n"] == 1:
result.fetchall.return_value = master_rows
else:
result.fetchall.return_value = []
return result
db.execute = mock_execute
runner = BatchDedupRunner(db=db)
cross_result = [{
"score": 0.96,
"score": 0.95,
"payload": {
"control_uuid": "uuid-2",
"control_id": "SEC-001",
"pattern_id": "CP-SEC-001", # different pattern!
"control_id": "CTRL-002",
"merge_group_hint": "implement:mfa:continuous",
},
}]
@@ -318,39 +353,9 @@ class TestCrossRegulationPass:
new_callable=AsyncMock, return_value=[0.1] * 1024), \
patch("compliance.services.batch_dedup_runner.qdrant_search_cross_regulation",
new_callable=AsyncMock, return_value=cross_result):
await runner._run_cross_regulation_pass()
await runner._run_cross_group_pass()
assert runner.stats["cross_reg_linked"] == 1
@pytest.mark.asyncio
async def test_cross_reg_ignores_same_pattern(self):
"""Cross-reg should NOT link controls from same pattern."""
db = MagicMock()
db.execute = MagicMock()
db.commit = MagicMock()
db.execute.return_value.fetchall.return_value = [
("uuid-1", "AUTH-001", "MFA", "CP-AUTH-001", "implement:mfa:none"),
]
runner = BatchDedupRunner(db=db)
# Match from SAME pattern
cross_result = [{
"score": 0.97,
"payload": {
"control_uuid": "uuid-3",
"control_id": "AUTH-002",
"pattern_id": "CP-AUTH-001", # same pattern
},
}]
with patch("compliance.services.batch_dedup_runner.get_embedding",
new_callable=AsyncMock, return_value=[0.1] * 1024), \
patch("compliance.services.batch_dedup_runner.qdrant_search_cross_regulation",
new_callable=AsyncMock, return_value=cross_result):
await runner._run_cross_regulation_pass()
assert runner.stats["cross_reg_linked"] == 0
assert runner.stats["cross_group_linked"] == 1
# ---------------------------------------------------------------------------
@@ -365,12 +370,14 @@ class TestProgressStats:
runner = BatchDedupRunner(db=db)
runner.stats["masters"] = 42
runner.stats["linked"] = 100
runner._progress_pattern = "CP-AUTH-001"
runner._progress_phase = "phase1"
runner._progress_count = 500
runner._progress_total = 85000
status = runner.get_status()
assert status["pattern"] == "CP-AUTH-001"
assert status["phase"] == "phase1"
assert status["progress"] == 500
assert status["total"] == 85000
assert status["masters"] == 42
assert status["linked"] == 100
@@ -415,12 +422,12 @@ def _make_control(
evidence: int = 0,
hint: str = "",
title: str = None,
pattern_id: str = "CP-AUTH-001",
pattern_id: str = None,
) -> dict:
"""Build a mock control dict for testing."""
return {
"uuid": f"{prefix}-uuid",
"control_id": f"AUTH-{prefix}",
"control_id": f"CTRL-{prefix}",
"title": title or f"Control {prefix}",
"objective": f"Objective for {prefix}",
"pattern_id": pattern_id,