feat(cookie): Layer-3 sufficiency-judge — Haiku re-judges embedding/boost rescues
The embedding/boost auto-rescue is intentionally optimistic (finds the topic, not fulfilment) -> 159 FN over-rescues vs Opus-GT (recall 0.13). Layer-3 re-judges exactly the rescued passes with the validated Haiku judge (cohort cookie_sufficiency_v1 P0.89/R0.91) -- NOT the Qwen-first cascade (local is disproven as a sufficiency judge) -- and un-passes them when the obligation is not concretely met. Gated to the full check (not skip_llm). Measured (5-firm Opus-GT, engine+L3): FN 159->12, recall 0.13->0.93, precision 0.96->0.90 (276 rescues corrected). "Embedding finds, Claude decides." Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
+65
@@ -0,0 +1,65 @@
|
||||
"""Layer-3 Sufficiency-Judge fuer Cookie-Policy.
|
||||
|
||||
Das Embedding/Boost-Auto-Rescue (Layer 0/2) ist BEWUSST optimistisch — es findet
|
||||
das Thema, beweist aber nicht die Erfuellung. Messung (2026-06-22): 159 FN
|
||||
(Over-Rescue) gegen Opus-GT, weil 'Thema erwaehnt' als 'erfuellt' durchgewunken
|
||||
wurde. Diese Schicht prueft GENAU die rescued Controls mit dem validierten
|
||||
Haiku-Judge (Cohort cookie_sufficiency_v1: P0.89/R0.91) — NICHT die Qwen-first-
|
||||
Kaskade (lokal ist als Sufficiency-Judge widerlegt) — und nimmt 'passed' zurueck,
|
||||
wenn die konkrete Pflicht nicht erfuellt ist. 'Embedding findet, Claude entscheidet.'
|
||||
|
||||
Nur fuer den NICHT-skip_llm-Pfad (voller Check); der schnelle/interaktive Pfad
|
||||
behaelt das deterministische Rescue.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from typing import Any
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Fragments of a result's "source" tag that identify an embedding or
# regex-boost rescue (as opposed to a plain deterministic match).
_RESCUE_MARKERS = ("+embedding", "+regex_boost")


def _is_rescued(r: dict[str, Any]) -> bool:
    """Return True if *r* is a passed control that was rescued.

    A control counts as rescued when its ``passed`` value is truthy and its
    ``source`` contains at least one of ``_RESCUE_MARKERS``. A missing or
    ``None`` source is treated as an empty string.

    Always returns a real ``bool``; a falsy ``passed`` value (``None``, ``0``,
    missing key) yields ``False`` rather than leaking through unchanged.
    """
    if not r.get("passed"):
        return False
    src = r.get("source") or ""
    return any(marker in src for marker in _RESCUE_MARKERS)
|
||||
|
||||
|
||||
async def judge_rescued(text: str, results: list[dict[str, Any]]) -> int:
    """Re-judge all rescued (embedding/boost) passed controls with Haiku.

    Every entry of *results* accepted by ``_is_rescued`` is submitted to the
    sufficiency judge. If the verdict explicitly reports the obligation as
    not fulfilled (``"erfuellt"`` is ``False``), the entry is un-passed in
    place: ``passed`` becomes ``False``, ``"+llm_failed"`` is appended to
    ``source``, ``matched_text`` is replaced by a marker string and the
    judge's reason (truncated to 200 characters) is stored under
    ``_judge_reason``. Entries for which no usable verdict could be obtained
    keep their pass.

    Args:
        text: Document text handed to the judge prompt builder.
        results: Control result dicts; rescued entries may be mutated in place.

    Returns:
        The number of rescues that were withdrawn (corrected).
    """
    candidates = [r for r in results if _is_rescued(r)]
    if not candidates:
        return 0

    # Imported lazily, and only once there is something to judge, so the
    # no-candidate path never touches the LLM modules.
    from compliance.services.llm_cascade import _call_anthropic
    from compliance.services.specialist_agents.dse.deep_check import (
        _JUDGE_SYS, _build_user, _parse,
    )

    max_attempts = 2
    corrected = 0
    for r in candidates:
        crit = r.get("_pass_criteria") or [r.get("label") or r.get("hint") or ""]
        if not isinstance(crit, list):
            crit = [str(crit)]
        title = r.get("label") or r.get("hint") or r.get("control_id") or ""
        user = _build_user(text, title, crit)

        verdict = None
        for attempt in range(1, max_attempts + 1):
            # Retry on transient errors (exception) as well as on malformed
            # answers (falsy parse result). A failure for one control must
            # not abort the remaining ones, otherwise earlier corrections
            # stay applied while the returned count is lost.
            try:
                parsed = _parse(
                    await _call_anthropic(_JUDGE_SYS, user, max_tokens=400)
                )
            except Exception as exc:
                logger.warning(
                    "cookie layer-3 judge call failed for %s (attempt %d/%d): %s",
                    r.get("control_id"), attempt, max_attempts, exc,
                )
                continue
            if parsed:
                verdict = parsed
                break

        # Only an explicit "not fulfilled" withdraws the pass; a missing,
        # non-dict or undecided verdict leaves the rescue untouched.
        if not isinstance(verdict, dict) or verdict.get("erfuellt") is not False:
            continue

        r["passed"] = False
        r["source"] = (r.get("source") or "") + "+llm_failed"
        r["matched_text"] = "[layer-3 sufficiency-judge: nicht erfuellt]"
        r["_judge_reason"] = str(verdict.get("begruendung") or "")[:200]
        corrected += 1

    if corrected:
        logger.info("cookie layer-3 sufficiency-judge: %d/%d rescues zurueckgenommen",
                    corrected, len(candidates))
    return corrected
|
||||
@@ -96,6 +96,22 @@ class CookiePolicyAgent(BaseSpecialistAgent):
|
||||
"Branchen-MCs entfernt"
|
||||
)
|
||||
|
||||
# Layer 3 — Sufficiency-Judge (Haiku) auf die embedding/boost-rescued
|
||||
# Controls: Embedding findet das Thema, Claude entscheidet ob die Pflicht
|
||||
# konkret erfuellt ist. Nur im vollen Check (nicht skip_llm).
|
||||
skip_llm = bool((agent_input.context or {}).get("skip_llm"))
|
||||
if not skip_llm:
|
||||
try:
|
||||
from ._sufficiency_judge import judge_rescued
|
||||
corrected = await judge_rescued(text, results)
|
||||
if corrected:
|
||||
notes_parts.append(
|
||||
f"layer-3 sufficiency-judge: {corrected} Rescues "
|
||||
"zurückgenommen"
|
||||
)
|
||||
except Exception as e:
|
||||
logger.warning("cookie layer-3 judge skipped: %s", e)
|
||||
|
||||
seen: set[str] = set()
|
||||
for r in results:
|
||||
mc_id = r.get("control_id") or ""
|
||||
|
||||
@@ -0,0 +1,67 @@
|
||||
"""Layer-3 cookie sufficiency-judge: only embedding/boost-RESCUED passes are
|
||||
re-judged by Haiku; keyword passes are untouched; a FEHLT verdict un-passes."""
|
||||
import pytest
|
||||
from unittest.mock import AsyncMock, patch
|
||||
|
||||
from compliance.services.specialist_agents.cookie_policy._sufficiency_judge import (
|
||||
judge_rescued,
|
||||
)
|
||||
|
||||
# Dotted path of the LLM call that each test replaces with an AsyncMock.
_ANTHROPIC = "compliance.services.llm_cascade._call_anthropic"


def _r(cid, source, passed=True):
    """Build a minimal control-result dict; the control id doubles as label."""
    result = dict(control_id=cid, source=source, passed=passed)
    result["label"] = cid
    result["_pass_criteria"] = ["konkrete Angabe nötig"]
    return result
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_rescued_unpassed_when_judge_fehlt():
    """An embedding-rescued pass is withdrawn when the judge says 'not fulfilled'."""
    control = _r("A", "keyword+embedding")
    judge_reply = '{"erfuellt": false, "confidence": 0.9, "begruendung": "fehlt"}'
    with patch(_ANTHROPIC, new=AsyncMock(return_value=judge_reply)):
        withdrawn = await judge_rescued("text", [control])
    assert withdrawn == 1
    assert control["passed"] is False
    assert "+llm_failed" in control["source"]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_rescued_kept_when_judge_erfuellt():
    """An embedding-rescued pass stays passed when the judge confirms it."""
    results = [_r("A", "keyword+embedding")]
    fake = AsyncMock(return_value='{"erfuellt": true, "confidence": 0.9}')
    with patch(_ANTHROPIC, new=fake):
        n = await judge_rescued("text", results)
    # Guard against a vacuous pass: the control must really have been sent to
    # the judge, otherwise "kept" would also hold if it was never examined.
    assert fake.await_count >= 1
    assert n == 0
    assert results[0]["passed"] is True
    # A confirmed rescue must not be tagged as withdrawn.
    assert results[0]["source"] == "keyword+embedding"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_keyword_pass_not_judged():
    """Controls that passed deterministically (keyword) are NOT sent to the judge."""
    control = _r("A", "keyword")
    judge = AsyncMock(return_value='{"erfuellt": false}')
    with patch(_ANTHROPIC, new=judge):
        withdrawn = await judge_rescued("text", [control])
    assert withdrawn == 0
    assert control["passed"] is True
    judge.assert_not_called()
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_boost_rescue_is_judged():
    """A regex-boost rescue is re-judged and withdrawn on a negative verdict."""
    control = _r("A", "keyword+regex_boost")
    judge = AsyncMock(return_value='{"erfuellt": false}')
    with patch(_ANTHROPIC, new=judge):
        withdrawn = await judge_rescued("text", [control])
    # Two separate asserts so a failure names the condition that broke.
    assert withdrawn == 1
    assert control["passed"] is False
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_failed_controls_ignored():
    """Controls that did not pass are outside this layer's responsibility."""
    control = _r("A", "keyword+embedding", passed=False)
    judge = AsyncMock(return_value='{"erfuellt": false}')
    with patch(_ANTHROPIC, new=judge):
        withdrawn = await judge_rescued("text", [control])
    assert withdrawn == 0
    judge.assert_not_called()
|
||||
Reference in New Issue
Block a user