e809d0bc1c
The embedding/boost auto-rescue is intentionally optimistic (finds the topic, not fulfilment) -> 159 FN over-rescues vs Opus-GT (recall 0.13). Layer-3 re-judges exactly the rescued passes with the validated Haiku judge (cohort cookie_sufficiency_v1 P0.89/R0.91) -- NOT the Qwen-first cascade (local is disproven as a sufficiency judge) -- and un-passes them when the obligation is not concretely met. Gated to the full check (not skip_llm). Measured (5-firm Opus-GT, engine+L3): FN 159->12, recall 0.13->0.93, precision 0.96->0.90 (276 rescues corrected). "Embedding finds, Claude decides." Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
68 lines
2.4 KiB
Python
68 lines
2.4 KiB
Python
"""Layer-3 cookie sufficiency-judge: only embedding/boost-RESCUED passes are
re-judged by Haiku; keyword passes are untouched; a FEHLT verdict un-passes
the control."""
|
|
import pytest
|
|
from unittest.mock import AsyncMock, patch
|
|
|
|
from compliance.services.specialist_agents.cookie_policy._sufficiency_judge import (
|
|
judge_rescued,
|
|
)
|
|
|
|
# Dotted import path handed to `patch(...)` by the tests in this module.
# NOTE(review): presumably the Anthropic call the sufficiency judge goes
# through -- confirm against `_sufficiency_judge`.
_ANTHROPIC = "compliance.services.llm_cascade._call_anthropic"
|
|
|
|
|
|
def _r(cid, source, passed=True):
|
|
return {"control_id": cid, "source": source, "passed": passed,
|
|
"label": cid, "_pass_criteria": ["konkrete Angabe nötig"]}
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_rescued_unpassed_when_judge_fehlt():
    """A rescued pass is revoked when the mocked judge answers "not fulfilled".

    The correction count must be 1, the control must flip to failed, and its
    source must carry the ``+llm_failed`` marker.
    """
    verdict = '{"erfuellt": false, "confidence": 0.9, "begruendung": "fehlt"}'
    judge = AsyncMock(return_value=verdict)
    controls = [_r("A", "keyword+embedding")]

    with patch(_ANTHROPIC, new=judge):
        corrected = await judge_rescued("text", controls)

    control = controls[0]
    assert corrected == 1
    assert control["passed"] is False
    assert "+llm_failed" in control["source"]
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_rescued_kept_when_judge_erfuellt():
    """A rescued pass stays passed when the mocked judge answers "fulfilled".

    Besides the unchanged result, the test checks that the judge was actually
    consulted: a zero correction count and ``passed is True`` would also hold
    if the control had never been judged, so without that check the test
    could pass vacuously.
    """
    results = [_r("A", "keyword+embedding")]
    fake = AsyncMock(return_value='{"erfuellt": true, "confidence": 0.9}')

    with patch(_ANTHROPIC, new=fake):
        n = await judge_rescued("text", results)

    assert n == 0
    assert results[0]["passed"] is True
    # Distinguish "judged and kept" from "silently skipped".
    assert fake.called
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_keyword_pass_not_judged():
    """Controls passed deterministically (keyword) are NOT sent to the judge."""
    judge = AsyncMock(return_value='{"erfuellt": false}')
    controls = [_r("A", "keyword")]

    with patch(_ANTHROPIC, new=judge):
        corrected = await judge_rescued("text", controls)

    # Nothing corrected, the pass is kept, and the mock was never invoked.
    assert corrected == 0
    assert controls[0]["passed"] is True
    assert judge.call_count == 0
|
|
assert fake.call_count == 0
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_boost_rescue_is_judged():
    """A ``keyword+regex_boost`` pass is re-judged and revoked on a negative verdict."""
    judge = AsyncMock(return_value='{"erfuellt": false}')
    controls = [_r("A", "keyword+regex_boost")]

    with patch(_ANTHROPIC, new=judge):
        corrected = await judge_rescued("text", controls)

    # Separate asserts so a failure names the condition that broke.
    assert corrected == 1
    assert controls[0]["passed"] is False
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_failed_controls_ignored():
    """Controls that already failed are not this layer's concern."""
    judge = AsyncMock(return_value='{"erfuellt": false}')
    controls = [_r("A", "keyword+embedding", passed=False)]

    with patch(_ANTHROPIC, new=judge):
        corrected = await judge_rescued("text", controls)

    # No correction is counted and the mocked judge is never called.
    assert corrected == 0
    assert judge.call_count == 0
|