Files
breakpilot-compliance/backend-compliance/tests/test_cookie_sufficiency_judge.py
T
Benjamin Admin 3e3644f83d feat(checkers): platform router + Haiku sufficiency tier; cookie is first consumer
Generalise "Embedding finds, Claude decides" into the shared Pruefer-Library:
- router.route_and_check dispatches control -> sensor_classification -> Checker.
- build_spec reads sensor_classification (CONTENT/LLM -> judge=haiku, the
  validated sufficiency tier; the Qwen-first cascade is disproven for sufficiency).
- LLMChecker gains a Haiku-direct tier (reuses the validated deep_check prompt).
- Cookie Layer-3 now routes through route_and_check instead of bespoke code, so
  cookie is the first real router consumer -- proves the architecture end-to-end.

Reproduces the validated result via the shared path: FN 159->14, recall
0.13->0.92, precision 0.89 (vs bespoke 12/0.93/0.90 -- within Haiku noise).
Tests: 10/10 (router dispatch + build_spec + haiku tier + cookie rewire).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-06-22 17:37:48 +02:00

69 lines
2.4 KiB
Python

"""Layer-3 cookie sufficiency-judge: only embedding/boost-RESCUED passes are
re-judged by Haiku; keyword passes are untouched; a FEHLT verdict un-passes."""
import pytest
from unittest.mock import AsyncMock, patch
from compliance.services.specialist_agents.cookie_policy._sufficiency_judge import (
judge_rescued,
)
# Dotted import path of the Anthropic call; every test swaps it for an
# AsyncMock via ``patch`` so no real LLM request is made.
_ANTHROPIC = "compliance.services.llm_cascade._call_anthropic"
# Stand-in policy text (one German sentence repeated four times) handed to
# ``judge_rescued`` as the document argument.
_DOC = "Volltext der Cookie-Richtlinie mit ausreichend Inhalt. " * 4
def _r(cid, source, passed=True):
return {"control_id": cid, "source": source, "passed": passed,
"label": cid, "_pass_criteria": ["konkrete Angabe nötig"]}
@pytest.mark.asyncio
async def test_rescued_unpassed_when_judge_fehlt():
    """An embedding-rescued pass is expected to be revoked on a negative verdict."""
    verdict = '{"erfuellt": false, "confidence": 0.9, "begruendung": "fehlt"}'
    judge_mock = AsyncMock(return_value=verdict)
    controls = [_r("A", "keyword+embedding")]

    with patch(_ANTHROPIC, new=judge_mock):
        unpassed = await judge_rescued(_DOC, controls)

    assert unpassed == 1
    rescued = controls[0]
    assert rescued["passed"] is False
    # The source tag is expected to carry a marker for the failed judgement.
    assert "+llm_failed" in rescued["source"]
@pytest.mark.asyncio
async def test_rescued_kept_when_judge_erfuellt():
    """An embedding-rescued pass stays passed when the judge confirms it.

    The judge mock must also have been called. Without that check the test
    would pass even if ``judge_rescued`` skipped the control entirely,
    because "confirmed by the judge" and "never asked" leave the result
    dict in the same state.
    """
    results = [_r("A", "keyword+embedding")]
    fake = AsyncMock(return_value='{"erfuellt": true, "confidence": 0.9}')
    with patch(_ANTHROPIC, new=fake):
        n = await judge_rescued(_DOC, results)
    assert n == 0
    assert results[0]["passed"] is True
    # Guards against a vacuous pass: the positive verdict must have been requested.
    assert fake.call_count >= 1
@pytest.mark.asyncio
async def test_keyword_pass_not_judged():
    """Controls passed deterministically (keyword) are NOT sent to the judge."""
    judge_mock = AsyncMock(return_value='{"erfuellt": false}')
    controls = [_r("A", "keyword")]

    with patch(_ANTHROPIC, new=judge_mock):
        unpassed = await judge_rescued(_DOC, controls)

    assert unpassed == 0
    assert controls[0]["passed"] is True
    # The mock would answer "not fulfilled", so it must never have been asked.
    judge_mock.assert_not_called()
@pytest.mark.asyncio
async def test_boost_rescue_is_judged():
    """A pass rescued by the regex boost is expected to be judged and revocable."""
    judge_mock = AsyncMock(return_value='{"erfuellt": false}')
    controls = [_r("A", "keyword+regex_boost")]

    with patch(_ANTHROPIC, new=judge_mock):
        unpassed = await judge_rescued(_DOC, controls)

    # Separate asserts so a failure shows which expectation broke.
    assert unpassed == 1
    assert controls[0]["passed"] is False
@pytest.mark.asyncio
async def test_failed_controls_ignored():
    """Controls that already failed are not this layer's concern."""
    judge_mock = AsyncMock(return_value='{"erfuellt": false}')
    controls = [_r("A", "keyword+embedding", passed=False)]

    with patch(_ANTHROPIC, new=judge_mock):
        unpassed = await judge_rescued(_DOC, controls)

    # Separate asserts so a failure shows which expectation broke.
    assert unpassed == 0
    assert judge_mock.call_count == 0