Fix regulation_filter bypass for chunks without regulation_code

Chunks without a regulation_code were silently passing through the filter
in _scan_rag(), causing unrelated documents (e.g. Data Act, legal templates)
to be included in filtered generation jobs. Chunks with a missing or empty
regulation_code are now skipped whenever regulation_filter is active.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-17 13:38:25 +01:00
parent d22c47c9eb
commit 36ef34169a
2 changed files with 61 additions and 5 deletions

View File

@@ -1022,6 +1022,54 @@ class TestRegulationFilter:
assert "nist_sp_800_218" in codes
assert "amlr" not in codes
@pytest.mark.asyncio
async def test_scan_rag_filters_out_empty_regulation_code(self):
    """An active regulation_filter must drop chunks that carry no regulation code.

    Three points are served by the mocked HTTP client: one tagged
    ``owasp_asvs``, one whose payload has no code key at all, and one whose
    ``regulation_code`` is the empty string. With ``regulation_filter=["owasp_"]``
    only the tagged point may come back from ``_scan_rag``.
    """
    # DB stub: iterating the result of execute() yields nothing.
    db_rows = MagicMock()
    db_rows.__iter__ = MagicMock(return_value=iter([]))
    db_stub = MagicMock()
    db_stub.execute.return_value = db_rows

    tagged_point = {"id": "1", "payload": {
        "chunk_text": "OWASP ASVS requirement for input validation " * 5,
        "regulation_code": "owasp_asvs",
    }}
    # Payload deliberately lacks regulation_id, regulation_code,
    # source_id and source_code.
    untagged_point = {"id": "2", "payload": {
        "chunk_text": "Some template without regulation code at all " * 5,
    }}
    blank_code_point = {"id": "3", "payload": {
        "chunk_text": "Another chunk with empty regulation code value " * 5,
        "regulation_code": "",
    }}

    # Single page of results; next_page_offset=None means there is no further page.
    scroll_response = MagicMock()
    scroll_response.status_code = 200
    scroll_response.json.return_value = {
        "result": {
            "points": [tagged_point, untagged_point, blank_code_point],
            "next_page_offset": None,
        }
    }

    http_client = AsyncMock()
    http_client.post.return_value = scroll_response

    with patch("compliance.services.control_generator.httpx.AsyncClient") as client_factory:
        # Make ``async with httpx.AsyncClient(...) as client`` hand out our stub.
        async_cm = client_factory.return_value
        async_cm.__aenter__ = AsyncMock(return_value=http_client)
        async_cm.__aexit__ = AsyncMock(return_value=False)

        generator = ControlGeneratorPipeline(db=db_stub, rag_client=MagicMock())
        filtered_config = GeneratorConfig(
            collections=["bp_compliance_ce"],
            regulation_filter=["owasp_"],
        )
        scanned = await generator._scan_rag(filtered_config)

    # Only the owasp chunk survives; the missing-code and blank-code chunks are dropped.
    assert len(scanned) == 1
    assert scanned[0].regulation_code == "owasp_asvs"
@pytest.mark.asyncio
async def test_scan_rag_no_filter_returns_all(self):
"""Verify _scan_rag returns all chunks when no regulation_filter."""