fix: adapt batch dedup to NULL pattern_id — group by merge_group_hint
All checks were successful
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Successful in 31s
CI/CD / test-python-backend-compliance (push) Successful in 31s
CI/CD / test-python-document-crawler (push) Successful in 21s
CI/CD / test-python-dsms-gateway (push) Successful in 19s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Successful in 2s
All checks were successful
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Successful in 31s
CI/CD / test-python-backend-compliance (push) Successful in 31s
CI/CD / test-python-document-crawler (push) Successful in 21s
CI/CD / test-python-dsms-gateway (push) Successful in 19s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Successful in 2s
All Pass 0b controls have pattern_id=NULL. Rewritten to:
- Phase 1: Group by merge_group_hint (action:object:trigger), 52k groups
- Phase 2: Cross-group embedding search for semantically similar masters
- Qdrant search uses unfiltered cross-regulation endpoint
- API param changed: pattern_id → hint_filter

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -6,7 +6,7 @@ Covers:
|
||||
- Master selection (highest quality score wins)
|
||||
- Duplicate linking (mark + parent-link transfer)
|
||||
- Dry run mode (no DB changes)
|
||||
- Cross-regulation pass
|
||||
- Cross-group pass
|
||||
- Progress reporting / stats
|
||||
"""
|
||||
|
||||
@@ -147,31 +147,31 @@ class TestMasterSelection:
|
||||
db = MagicMock()
|
||||
db.execute = MagicMock()
|
||||
db.commit = MagicMock()
|
||||
# Mock parent link transfer query
|
||||
db.execute.return_value.fetchall.return_value = []
|
||||
|
||||
runner = BatchDedupRunner(db=db)
|
||||
|
||||
sparse = _make_control("s1", reqs=1, hint="implement:mfa:none")
|
||||
rich = _make_control("r1", reqs=5, tests=3, evidence=2, hint="implement:mfa:none")
|
||||
medium = _make_control("m1", reqs=2, tests=1, hint="implement:mfa:none")
|
||||
sparse = _make_control("s1", reqs=1, hint="implement:mfa:none",
|
||||
title="MFA implementiert")
|
||||
rich = _make_control("r1", reqs=5, tests=3, evidence=2,
|
||||
hint="implement:mfa:none", title="MFA implementiert")
|
||||
medium = _make_control("m1", reqs=2, tests=1,
|
||||
hint="implement:mfa:none", title="MFA implementiert")
|
||||
|
||||
controls = [sparse, medium, rich]
|
||||
|
||||
# Mock embedding to avoid real API calls
|
||||
# All have same title → all should be title-identical linked
|
||||
with patch("compliance.services.batch_dedup_runner.get_embedding",
|
||||
new_callable=AsyncMock, return_value=[0.1] * 1024), \
|
||||
patch("compliance.services.batch_dedup_runner.qdrant_upsert",
|
||||
new_callable=AsyncMock, return_value=True), \
|
||||
patch("compliance.services.batch_dedup_runner.qdrant_search",
|
||||
new_callable=AsyncMock, return_value=[{
|
||||
"score": 0.95,
|
||||
"payload": {"control_uuid": rich["uuid"],
|
||||
"control_id": rich["control_id"]},
|
||||
}]):
|
||||
await runner._process_pattern_group("CP-AUTH-001", controls, dry_run=True)
|
||||
new_callable=AsyncMock, return_value=True):
|
||||
await runner._process_hint_group("implement:mfa:none", controls, dry_run=True)
|
||||
|
||||
# Rich should be master (1 master), others linked (2 linked)
|
||||
assert runner.stats["masters"] == 1
|
||||
assert runner.stats["linked"] == 2
|
||||
assert runner.stats["skipped_title_identical"] == 2
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -191,28 +191,19 @@ class TestDryRun:
|
||||
runner = BatchDedupRunner(db=db)
|
||||
|
||||
controls = [
|
||||
_make_control("a", reqs=3, hint="implement:mfa:none"),
|
||||
_make_control("b", reqs=1, hint="implement:mfa:none"),
|
||||
_make_control("a", reqs=3, hint="implement:mfa:none", title="MFA impl"),
|
||||
_make_control("b", reqs=1, hint="implement:mfa:none", title="MFA impl"),
|
||||
]
|
||||
|
||||
with patch("compliance.services.batch_dedup_runner.get_embedding",
|
||||
new_callable=AsyncMock, return_value=[0.1] * 1024), \
|
||||
patch("compliance.services.batch_dedup_runner.qdrant_upsert",
|
||||
new_callable=AsyncMock, return_value=True), \
|
||||
patch("compliance.services.batch_dedup_runner.qdrant_search",
|
||||
new_callable=AsyncMock, return_value=[{
|
||||
"score": 0.95,
|
||||
"payload": {"control_uuid": "a-uuid",
|
||||
"control_id": "AUTH-001"},
|
||||
}]):
|
||||
await runner._process_pattern_group("CP-AUTH-001", controls, dry_run=True)
|
||||
new_callable=AsyncMock, return_value=True):
|
||||
await runner._process_hint_group("implement:mfa:none", controls, dry_run=True)
|
||||
|
||||
# No DB execute calls for UPDATE/INSERT (only the initial load query was mocked)
|
||||
# In dry_run, _mark_duplicate and _embed_and_index are skipped
|
||||
assert runner.stats["masters"] == 1
|
||||
# qdrant_upsert should NOT have been called (dry_run skips indexing)
|
||||
from compliance.services.batch_dedup_runner import qdrant_upsert
|
||||
# No commit for dedup operations
|
||||
assert runner.stats["linked"] == 1
|
||||
# No commit for dedup operations in dry_run
|
||||
db.commit.assert_not_called()
|
||||
|
||||
|
||||
@@ -261,56 +252,100 @@ class TestTitleIdenticalShortCircuit:
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_identical_titles_skip_embedding(self):
|
||||
"""Controls with identical titles in same merge group → direct link."""
|
||||
"""Controls with identical titles in same hint group → direct link."""
|
||||
db = MagicMock()
|
||||
db.execute = MagicMock()
|
||||
db.commit = MagicMock()
|
||||
# Mock the parent link transfer query
|
||||
db.execute.return_value.fetchall.return_value = []
|
||||
|
||||
runner = BatchDedupRunner(db=db)
|
||||
|
||||
master = _make_control("m", reqs=3, hint="implement:mfa:none",
|
||||
title="MFA implementieren")
|
||||
candidate = _make_control("c", reqs=1, hint="implement:mfa:none",
|
||||
title="MFA implementieren")
|
||||
controls = [
|
||||
_make_control("m", reqs=3, hint="implement:mfa:none",
|
||||
title="MFA implementieren"),
|
||||
_make_control("c", reqs=1, hint="implement:mfa:none",
|
||||
title="MFA implementieren"),
|
||||
]
|
||||
|
||||
with patch("compliance.services.batch_dedup_runner.get_embedding",
|
||||
new_callable=AsyncMock) as mock_embed:
|
||||
await runner._check_and_link(master, candidate, "CP-AUTH-001", dry_run=False)
|
||||
new_callable=AsyncMock) as mock_embed, \
|
||||
patch("compliance.services.batch_dedup_runner.qdrant_upsert",
|
||||
new_callable=AsyncMock, return_value=True):
|
||||
await runner._process_hint_group("implement:mfa:none", controls, dry_run=False)
|
||||
|
||||
# Embedding should NOT be called (title-identical short-circuit)
|
||||
mock_embed.assert_not_called()
|
||||
# Embedding should only be called for the master (indexing), not for linking
|
||||
assert runner.stats["linked"] == 1
|
||||
assert runner.stats["skipped_title_identical"] == 1
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Cross-Regulation Pass TESTS
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestCrossRegulationPass:
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_cross_reg_creates_link(self):
|
||||
async def test_different_titles_use_embedding(self):
|
||||
"""Controls with different titles should use embedding check."""
|
||||
db = MagicMock()
|
||||
db.execute = MagicMock()
|
||||
db.commit = MagicMock()
|
||||
# First call: load masters
|
||||
db.execute.return_value.fetchall.return_value = [
|
||||
("uuid-1", "AUTH-001", "MFA implementieren", "CP-AUTH-001",
|
||||
db.execute.return_value.fetchall.return_value = []
|
||||
|
||||
runner = BatchDedupRunner(db=db)
|
||||
|
||||
controls = [
|
||||
_make_control("m", reqs=3, hint="implement:mfa:none",
|
||||
title="MFA implementieren fuer Admins"),
|
||||
_make_control("c", reqs=1, hint="implement:mfa:none",
|
||||
title="MFA einrichten fuer alle Benutzer"),
|
||||
]
|
||||
|
||||
with patch("compliance.services.batch_dedup_runner.get_embedding",
|
||||
new_callable=AsyncMock, return_value=[0.1] * 1024) as mock_embed, \
|
||||
patch("compliance.services.batch_dedup_runner.qdrant_upsert",
|
||||
new_callable=AsyncMock, return_value=True), \
|
||||
patch("compliance.services.batch_dedup_runner.qdrant_search_cross_regulation",
|
||||
new_callable=AsyncMock, return_value=[]):
|
||||
await runner._process_hint_group("implement:mfa:none", controls, dry_run=False)
|
||||
|
||||
# Different titles → embedding was called for both (master + candidate)
|
||||
assert mock_embed.call_count >= 2
|
||||
# No Qdrant results → linked anyway (same hint = same action+object)
|
||||
assert runner.stats["linked"] == 1
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Cross-Group Pass TESTS
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestCrossGroupPass:
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_cross_group_creates_link(self):
|
||||
db = MagicMock()
|
||||
db.commit = MagicMock()
|
||||
|
||||
# First call returns masters, subsequent calls return empty (for transfer)
|
||||
master_rows = [
|
||||
("uuid-1", "CTRL-001", "MFA implementieren",
|
||||
"implement:multi_factor_auth:none"),
|
||||
]
|
||||
call_count = {"n": 0}
|
||||
|
||||
def mock_execute(stmt, params=None):
|
||||
result = MagicMock()
|
||||
call_count["n"] += 1
|
||||
if call_count["n"] == 1:
|
||||
result.fetchall.return_value = master_rows
|
||||
else:
|
||||
result.fetchall.return_value = []
|
||||
return result
|
||||
|
||||
db.execute = mock_execute
|
||||
|
||||
runner = BatchDedupRunner(db=db)
|
||||
|
||||
cross_result = [{
|
||||
"score": 0.96,
|
||||
"score": 0.95,
|
||||
"payload": {
|
||||
"control_uuid": "uuid-2",
|
||||
"control_id": "SEC-001",
|
||||
"pattern_id": "CP-SEC-001", # different pattern!
|
||||
"control_id": "CTRL-002",
|
||||
"merge_group_hint": "implement:mfa:continuous",
|
||||
},
|
||||
}]
|
||||
|
||||
@@ -318,39 +353,9 @@ class TestCrossRegulationPass:
|
||||
new_callable=AsyncMock, return_value=[0.1] * 1024), \
|
||||
patch("compliance.services.batch_dedup_runner.qdrant_search_cross_regulation",
|
||||
new_callable=AsyncMock, return_value=cross_result):
|
||||
await runner._run_cross_regulation_pass()
|
||||
await runner._run_cross_group_pass()
|
||||
|
||||
assert runner.stats["cross_reg_linked"] == 1
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_cross_reg_ignores_same_pattern(self):
|
||||
"""Cross-reg should NOT link controls from same pattern."""
|
||||
db = MagicMock()
|
||||
db.execute = MagicMock()
|
||||
db.commit = MagicMock()
|
||||
db.execute.return_value.fetchall.return_value = [
|
||||
("uuid-1", "AUTH-001", "MFA", "CP-AUTH-001", "implement:mfa:none"),
|
||||
]
|
||||
|
||||
runner = BatchDedupRunner(db=db)
|
||||
|
||||
# Match from SAME pattern
|
||||
cross_result = [{
|
||||
"score": 0.97,
|
||||
"payload": {
|
||||
"control_uuid": "uuid-3",
|
||||
"control_id": "AUTH-002",
|
||||
"pattern_id": "CP-AUTH-001", # same pattern
|
||||
},
|
||||
}]
|
||||
|
||||
with patch("compliance.services.batch_dedup_runner.get_embedding",
|
||||
new_callable=AsyncMock, return_value=[0.1] * 1024), \
|
||||
patch("compliance.services.batch_dedup_runner.qdrant_search_cross_regulation",
|
||||
new_callable=AsyncMock, return_value=cross_result):
|
||||
await runner._run_cross_regulation_pass()
|
||||
|
||||
assert runner.stats["cross_reg_linked"] == 0
|
||||
assert runner.stats["cross_group_linked"] == 1
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -365,12 +370,14 @@ class TestProgressStats:
|
||||
runner = BatchDedupRunner(db=db)
|
||||
runner.stats["masters"] = 42
|
||||
runner.stats["linked"] = 100
|
||||
runner._progress_pattern = "CP-AUTH-001"
|
||||
runner._progress_phase = "phase1"
|
||||
runner._progress_count = 500
|
||||
runner._progress_total = 85000
|
||||
|
||||
status = runner.get_status()
|
||||
assert status["pattern"] == "CP-AUTH-001"
|
||||
assert status["phase"] == "phase1"
|
||||
assert status["progress"] == 500
|
||||
assert status["total"] == 85000
|
||||
assert status["masters"] == 42
|
||||
assert status["linked"] == 100
|
||||
|
||||
@@ -415,12 +422,12 @@ def _make_control(
|
||||
evidence: int = 0,
|
||||
hint: str = "",
|
||||
title: str = None,
|
||||
pattern_id: str = "CP-AUTH-001",
|
||||
pattern_id: str = None,
|
||||
) -> dict:
|
||||
"""Build a mock control dict for testing."""
|
||||
return {
|
||||
"uuid": f"{prefix}-uuid",
|
||||
"control_id": f"AUTH-{prefix}",
|
||||
"control_id": f"CTRL-{prefix}",
|
||||
"title": title or f"Control {prefix}",
|
||||
"objective": f"Objective for {prefix}",
|
||||
"pattern_id": pattern_id,
|
||||
|
||||
Reference in New Issue
Block a user