From e809d0bc1c598c1dcd238f8917669b28d4b821ef Mon Sep 17 00:00:00 2001
From: Benjamin Admin
Date: Mon, 22 Jun 2026 10:07:53 +0200
Subject: [PATCH] =?UTF-8?q?feat(cookie):=20Layer-3=20sufficiency-judge=20?=
 =?UTF-8?q?=E2=80=94=20Haiku=20re-judges=20embedding/boost=20rescues?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The embedding/boost auto-rescue is intentionally optimistic (finds the
topic, not fulfilment) -> 159 FN over-rescues vs Opus-GT (recall 0.13).
Layer-3 re-judges exactly the rescued passes with the validated Haiku
judge (cohort cookie_sufficiency_v1 P0.89/R0.91) -- NOT the Qwen-first
cascade (local is disproven as a sufficiency judge) -- and un-passes them
when the obligation is not concretely met. Gated to the full check (not
skip_llm).

Measured (5-firm Opus-GT, engine+L3): FN 159->12, recall 0.13->0.93,
precision 0.96->0.90 (276 rescues corrected). "Embedding finds, Claude
decides."

Co-Authored-By: Claude Opus 4.7
---
Reviewer notes (this section is ignored by git-am):
* Line structure was rebuilt from a whitespace-collapsed copy of the patch.
  The indentation of the agent.py context lines is a best guess; verify it
  against the target tree before applying.
* Differs from commit e809d0bc: judge call errors no longer escape mid-loop
  (partial corrections are counted and reported), _is_rescued returns a real
  bool, comments/docstrings are in English, one regression test was added.
  The index hashes below are the original ones and do not describe the
  revised blobs.

 .../cookie_policy/_sufficiency_judge.py       | 95 +++++++++++++++++++
 .../specialist_agents/cookie_policy/agent.py  | 16 ++++
 .../tests/test_cookie_sufficiency_judge.py    | 81 ++++++++++++++++
 3 files changed, 192 insertions(+)
 create mode 100644 backend-compliance/compliance/services/specialist_agents/cookie_policy/_sufficiency_judge.py
 create mode 100644 backend-compliance/tests/test_cookie_sufficiency_judge.py

diff --git a/backend-compliance/compliance/services/specialist_agents/cookie_policy/_sufficiency_judge.py b/backend-compliance/compliance/services/specialist_agents/cookie_policy/_sufficiency_judge.py
new file mode 100644
index 00000000..3a26d566
--- /dev/null
+++ b/backend-compliance/compliance/services/specialist_agents/cookie_policy/_sufficiency_judge.py
@@ -0,0 +1,95 @@
+"""Layer-3 sufficiency judge for the cookie policy.
+
+The embedding/boost auto-rescue (layer 0/2) is DELIBERATELY optimistic: it finds
+the topic but does not prove fulfilment. Measurement (2026-06-22): 159 FN
+(over-rescues) against Opus-GT, because 'topic mentioned' was waved through as
+'fulfilled'. This layer re-checks EXACTLY the rescued controls with the validated
+Haiku judge (cohort cookie_sufficiency_v1: P0.89/R0.91) -- NOT the Qwen-first
+cascade (local is disproven as a sufficiency judge) -- and takes 'passed' back
+when the concrete obligation is not met. 'Embedding finds, Claude decides.'
+
+Only for the NON-skip_llm path (full check); the fast/interactive path keeps the
+deterministic rescue.
+"""
+
+from __future__ import annotations
+
+import logging
+from typing import Any
+
+logger = logging.getLogger(__name__)
+
+# "source" substrings that mark a pass as an embedding/boost rescue.
+_RESCUE_MARKERS = ("+embedding", "+regex_boost")
+# Judge attempts per control; a raised call error uses up an attempt too.
+_MAX_ATTEMPTS = 2
+
+
+def _is_rescued(r: dict[str, Any]) -> bool:
+    """Return True for a passed control whose source carries a rescue marker."""
+    src = r.get("source") or ""
+    # bool(): a missing/None "passed" must yield False, not None.
+    return bool(r.get("passed")) and any(m in src for m in _RESCUE_MARKERS)
+
+
+async def judge_rescued(text: str, results: list[dict[str, Any]]) -> int:
+    """Re-judge all rescued (embedding/boost) passed controls with Haiku.
+
+    A rescue is taken back (the result dict is mutated in place) only when the
+    judge explicitly answers ``erfuellt: false``. Without a usable verdict the
+    pass is kept. If the judge call itself still raises on the last attempt,
+    judging stops and the corrections made so far are reported.
+
+    Args:
+        text: Policy text passed into the judge prompt.
+        results: Control results; rescued entries may be modified in place.
+
+    Returns:
+        Number of rescues that were taken back.
+    """
+    # Function-level imports: the names are looked up at call time, which the
+    # tests rely on when they patch llm_cascade._call_anthropic.
+    from compliance.services.llm_cascade import _call_anthropic
+    from compliance.services.specialist_agents.dse.deep_check import (
+        _JUDGE_SYS, _build_user, _parse,
+    )
+
+    candidates = [r for r in results if _is_rescued(r)]
+    if not candidates:
+        return 0
+    corrected = 0
+    for r in candidates:
+        crit = r.get("_pass_criteria") or [r.get("label") or r.get("hint") or ""]
+        if not isinstance(crit, list):
+            crit = [str(crit)]
+        title = r.get("label") or r.get("hint") or r.get("control_id") or ""
+        user = _build_user(text, title, crit)
+        verdict = None
+        call_error: Exception | None = None
+        for _ in range(_MAX_ATTEMPTS):
+            try:
+                raw = await _call_anthropic(_JUDGE_SYS, user, max_tokens=400)
+            except Exception as e:  # transient call failure: retry, then give up
+                call_error = e
+                continue
+            call_error = None
+            p = _parse(raw)
+            if p:
+                verdict = p
+                break
+        if call_error is not None:
+            # Stop here instead of raising: earlier results are already mutated
+            # and the caller needs their count.
+            logger.warning("cookie layer-3 judge aborted at %s: %s",
+                           r.get("control_id") or title, call_error)
+            break
+        if verdict is not None and verdict.get("erfuellt") is False:
+            r["passed"] = False
+            r["source"] = (r.get("source") or "") + "+llm_failed"
+            r["matched_text"] = "[layer-3 sufficiency-judge: nicht erfuellt]"
+            r["_judge_reason"] = (verdict.get("begruendung") or "")[:200]
+            corrected += 1
+    if corrected:
+        logger.info("cookie layer-3 sufficiency-judge: %d/%d rescues zurueckgenommen",
+                    corrected, len(candidates))
+    return corrected
diff --git a/backend-compliance/compliance/services/specialist_agents/cookie_policy/agent.py b/backend-compliance/compliance/services/specialist_agents/cookie_policy/agent.py
index 3606de7b..b995f245 100644
--- a/backend-compliance/compliance/services/specialist_agents/cookie_policy/agent.py
+++ b/backend-compliance/compliance/services/specialist_agents/cookie_policy/agent.py
@@ -96,6 +96,22 @@ class CookiePolicyAgent(BaseSpecialistAgent):
                 "Branchen-MCs entfernt"
             )

+        # Layer 3 -- sufficiency judge (Haiku) on the embedding/boost-rescued
+        # controls: embedding finds the topic, Claude decides whether the
+        # obligation is concretely met. Full check only (not skip_llm).
+        skip_llm = bool((agent_input.context or {}).get("skip_llm"))
+        if not skip_llm:
+            try:
+                from ._sufficiency_judge import judge_rescued
+                corrected = await judge_rescued(text, results)
+                if corrected:
+                    notes_parts.append(
+                        f"layer-3 sufficiency-judge: {corrected} Rescues "
+                        "zurückgenommen"
+                    )
+            except Exception as e:
+                logger.warning("cookie layer-3 judge skipped: %s", e)
+
         seen: set[str] = set()
         for r in results:
             mc_id = r.get("control_id") or ""
diff --git a/backend-compliance/tests/test_cookie_sufficiency_judge.py b/backend-compliance/tests/test_cookie_sufficiency_judge.py
new file mode 100644
index 00000000..95a00877
--- /dev/null
+++ b/backend-compliance/tests/test_cookie_sufficiency_judge.py
@@ -0,0 +1,81 @@
+"""Layer-3 cookie sufficiency-judge: only embedding/boost-RESCUED passes are
+re-judged by Haiku; keyword passes are untouched; a FEHLT verdict un-passes."""
+import pytest
+from unittest.mock import AsyncMock, patch
+
+from compliance.services.specialist_agents.cookie_policy._sufficiency_judge import (
+    judge_rescued,
+)
+
+_ANTHROPIC = "compliance.services.llm_cascade._call_anthropic"
+
+
+def _r(cid, source, passed=True):
+    return {"control_id": cid, "source": source, "passed": passed,
+            "label": cid, "_pass_criteria": ["konkrete Angabe nötig"]}
+
+
+@pytest.mark.asyncio
+async def test_rescued_unpassed_when_judge_fehlt():
+    results = [_r("A", "keyword+embedding")]
+    fake = AsyncMock(return_value='{"erfuellt": false, "confidence": 0.9, "begruendung": "fehlt"}')
+    with patch(_ANTHROPIC, new=fake):
+        n = await judge_rescued("text", results)
+    assert n == 1
+    assert results[0]["passed"] is False
+    assert "+llm_failed" in results[0]["source"]
+
+
+@pytest.mark.asyncio
+async def test_rescued_kept_when_judge_erfuellt():
+    results = [_r("A", "keyword+embedding")]
+    fake = AsyncMock(return_value='{"erfuellt": true, "confidence": 0.9}')
+    with patch(_ANTHROPIC, new=fake):
+        n = await judge_rescued("text", results)
+    assert n == 0
+    assert results[0]["passed"] is True
+
+
+@pytest.mark.asyncio
+async def test_keyword_pass_not_judged():
+    """Controls that passed deterministically (keyword) are NOT sent to the judge."""
+    results = [_r("A", "keyword")]
+    fake = AsyncMock(return_value='{"erfuellt": false}')
+    with patch(_ANTHROPIC, new=fake):
+        n = await judge_rescued("text", results)
+    assert n == 0
+    assert results[0]["passed"] is True
+    assert fake.call_count == 0
+
+
+@pytest.mark.asyncio
+async def test_boost_rescue_is_judged():
+    results = [_r("A", "keyword+regex_boost")]
+    fake = AsyncMock(return_value='{"erfuellt": false}')
+    with patch(_ANTHROPIC, new=fake):
+        n = await judge_rescued("text", results)
+    assert n == 1 and results[0]["passed"] is False
+
+
+@pytest.mark.asyncio
+async def test_failed_controls_ignored():
+    """Controls that did not pass (failed) are not this layer's concern."""
+    results = [_r("A", "keyword+embedding", passed=False)]
+    fake = AsyncMock(return_value='{"erfuellt": false}')
+    with patch(_ANTHROPIC, new=fake):
+        n = await judge_rescued("text", results)
+    assert n == 0 and fake.call_count == 0
+
+
+@pytest.mark.asyncio
+async def test_call_error_stops_but_reports_earlier_corrections():
+    """A raising judge call keeps that rescue and does not lose prior counts."""
+    results = [_r("A", "keyword+embedding"), _r("B", "keyword+embedding")]
+    fake = AsyncMock(side_effect=[
+        '{"erfuellt": false}', RuntimeError("down"), RuntimeError("down"),
+    ])
+    with patch(_ANTHROPIC, new=fake):
+        n = await judge_rescued("text", results)
+    assert n == 1
+    assert results[0]["passed"] is False
+    assert results[1]["passed"] is True