feat(pipeline): Anthropic Batch API, source/regulation filter, cost optimization
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 35s
CI/CD / test-python-backend-compliance (push) Successful in 34s
CI/CD / test-python-document-crawler (push) Successful in 22s
CI/CD / test-python-dsms-gateway (push) Successful in 19s
CI/CD / validate-canonical-controls (push) Successful in 11s
CI/CD / Deploy (push) Has been skipped
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 35s
CI/CD / test-python-backend-compliance (push) Successful in 34s
CI/CD / test-python-document-crawler (push) Successful in 22s
CI/CD / test-python-dsms-gateway (push) Successful in 19s
CI/CD / validate-canonical-controls (push) Successful in 11s
CI/CD / Deploy (push) Has been skipped
- Add Anthropic API support to decomposition Pass 0a/0b (prompt caching, content batching)
- Add Anthropic Batch API (50% cost reduction, async 24h processing)
- Add source_filter (ILIKE on source_citation) for regulation-based filtering
- Add category_filter to Pass 0a for selective decomposition
- Add regulation_filter to control_generator for RAG scan phase filtering (prefix match on regulation_code — enables CE + Code Review focus)
- New API endpoints: batch-submit-0a, batch-submit-0b, batch-status, batch-process
- 83 new tests (all passing)

Cost reduction: $2,525 → ~$600-700 with all optimizations combined.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -947,3 +947,120 @@ class TestBatchProcessingLoop:
|
||||
assert len(result) == 1
|
||||
assert result[0].release_state == "too_close"
|
||||
assert result[0].generation_metadata["similarity_status"] == "FAIL"
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Regulation Filter Tests
|
||||
# =============================================================================
|
||||
|
||||
class TestRegulationFilter:
    """Tests for regulation_filter in GeneratorConfig."""

    @staticmethod
    async def _scan_with_mocked_qdrant(config, points):
        """Run ``ControlGeneratorPipeline._scan_rag`` against mocked backends.

        The database mock yields no rows, and the patched ``httpx.AsyncClient``
        answers every ``post`` with a single-page scroll response containing
        *points*.

        Args:
            config: The ``GeneratorConfig`` passed to ``_scan_rag``.
            points: Point dicts (``id`` + ``payload``) for the mocked response.

        Returns:
            Whatever ``_scan_rag`` returns for the given config.
        """
        scroll_response = {
            "result": {
                "points": points,
                "next_page_offset": None,
            },
        }

        # Configure ONE result object and attach it afterwards. Stubbing
        # ``fetchall`` first and then replacing ``execute.return_value``
        # would silently throw the ``fetchall`` stub away.
        db_result = MagicMock()
        db_result.fetchall.return_value = []
        db_result.__iter__ = MagicMock(return_value=iter([]))
        mock_db = MagicMock()
        mock_db.execute.return_value = db_result

        with patch("compliance.services.control_generator.httpx.AsyncClient") as mock_client_cls:
            mock_client = AsyncMock()
            mock_client_cls.return_value.__aenter__ = AsyncMock(return_value=mock_client)
            mock_client_cls.return_value.__aexit__ = AsyncMock(return_value=False)

            mock_resp = MagicMock()
            mock_resp.status_code = 200
            mock_resp.json.return_value = scroll_response
            mock_client.post.return_value = mock_resp

            pipeline = ControlGeneratorPipeline(db=mock_db, rag_client=MagicMock())
            return await pipeline._scan_rag(config)

    def test_config_accepts_regulation_filter(self):
        """The config stores the given prefix list unchanged."""
        prefixes = ["owasp_", "nist_", "eu_2023_1230"]
        config = GeneratorConfig(regulation_filter=list(prefixes))
        assert config.regulation_filter == prefixes

    def test_config_default_none(self):
        """Without an explicit value, regulation_filter is None."""
        config = GeneratorConfig()
        assert config.regulation_filter is None

    @pytest.mark.asyncio
    async def test_scan_rag_filters_by_regulation(self):
        """Verify _scan_rag skips chunks not matching regulation_filter."""
        # Mock Qdrant scroll response with mixed regulation_codes
        points = [
            {"id": "1", "payload": {
                "chunk_text": "OWASP ASVS requirement for input validation " * 5,
                "regulation_code": "owasp_asvs",
                "regulation_name": "OWASP ASVS",
            }},
            {"id": "2", "payload": {
                "chunk_text": "AML anti-money laundering requirement for banks " * 5,
                "regulation_code": "amlr",
                "regulation_name": "AML-Verordnung",
            }},
            {"id": "3", "payload": {
                "chunk_text": "NIST secure software development framework req " * 5,
                "regulation_code": "nist_sp_800_218",
                "regulation_name": "NIST SSDF",
            }},
        ]

        # With filter: only owasp_ and nist_ prefixes
        config = GeneratorConfig(
            collections=["bp_compliance_ce"],
            regulation_filter=["owasp_", "nist_"],
        )
        results = await self._scan_with_mocked_qdrant(config, points)

        # Should only get 2 chunks (owasp + nist), not amlr
        assert len(results) == 2
        codes = {r.regulation_code for r in results}
        assert "owasp_asvs" in codes
        assert "nist_sp_800_218" in codes
        assert "amlr" not in codes

    @pytest.mark.asyncio
    async def test_scan_rag_no_filter_returns_all(self):
        """Verify _scan_rag returns all chunks when no regulation_filter."""
        points = [
            {"id": "1", "payload": {
                "chunk_text": "OWASP requirement for secure authentication " * 5,
                "regulation_code": "owasp_asvs",
            }},
            {"id": "2", "payload": {
                "chunk_text": "AML compliance requirement for financial inst " * 5,
                "regulation_code": "amlr",
            }},
        ]

        config = GeneratorConfig(
            collections=["bp_compliance_ce"],
            regulation_filter=None,
        )
        results = await self._scan_with_mocked_qdrant(config, points)

        assert len(results) == 2
|
||||
|
||||
Reference in New Issue
Block a user