feat(checkers): platform router + Haiku sufficiency tier; cookie is first consumer

Generalise "Embedding finds, Claude decides" into the shared Pruefer-Library:
- router.route_and_check dispatches control -> sensor_classification -> Checker.
- build_spec reads sensor_classification (CONTENT/LLM -> judge=haiku, the
  validated sufficiency tier; the Qwen-first cascade is disproven for sufficiency).
- LLMChecker gains a Haiku-direct tier (reuses the validated deep_check prompt).
- Cookie Layer-3 now routes through route_and_check instead of bespoke code, so
  cookie is the first real router consumer -- proves the architecture end-to-end.

Reproduces the validated result via the shared path: FN 159->14, recall
0.13->0.92, precision 0.89 (vs bespoke 12/0.93/0.90 -- within Haiku noise).
Tests: 10/10 (router dispatch + build_spec + haiku tier + cookie rewire).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-06-22 12:56:14 +02:00
parent e809d0bc1c
commit 3e3644f83d
5 changed files with 170 additions and 19 deletions
@@ -45,6 +45,11 @@ class LLMChecker:
text = doc.text or "" text = doc.text or ""
if len(text) < 50: if len(text) < 50:
return CheckResult(present=None, source="llm") return CheckResult(present=None, source="llm")
# decision_method=LLM mit judge='haiku': Sufficiency-Pfad (validiert
# P0.89/R0.91). Der Qwen-first-Cascade ist als Sufficiency-Judge
# widerlegt -> hier Haiku direkt, kriteriengeführte Subsumtion.
if (ctrl.extra or {}).get("judge") == "haiku":
return await self._haiku(ctrl, text)
secs = _sections(text) secs = _sections(text)
if ctrl.topic_regex: if ctrl.topic_regex:
rel = [s for s in secs if re.search(ctrl.topic_regex, s, re.I)][:6] or secs[:6] rel = [s for s in secs if re.search(ctrl.topic_regex, s, re.I)][:6] or secs[:6]
@@ -71,3 +76,31 @@ class LLMChecker:
except Exception as e: except Exception as e:
logger.info("llm checker fail %s: %s", ctrl.control_id, str(e)[:80]) logger.info("llm checker fail %s: %s", ctrl.control_id, str(e)[:80])
return CheckResult(present=None, source="error") return CheckResult(present=None, source="error")
async def _haiku(self, ctrl: ControlSpec, text: str) -> CheckResult:
    """Judge sufficiency by calling Haiku directly.

    The judgement is criteria-driven: the criteria are taken from
    ``ctrl.paraphrases`` and fall back to the control's label (or its id)
    when no paraphrases are set. The system prompt, user-prompt builder and
    reply parser are reused from the ``deep_check`` module.

    Args:
        ctrl: Control to judge; ``label``, ``control_id`` and
            ``paraphrases`` are read.
        text: Document text handed to the judge.

    Returns:
        A ``CheckResult`` with ``source="haiku"`` whose ``present`` is the
        judge's boolean verdict, or ``None`` when no usable verdict was
        obtained. Any exception yields ``present=None`` with
        ``source="error"``.
    """
    try:
        from compliance.services.llm_cascade import _call_anthropic
        from compliance.services.specialist_agents.dse.deep_check import (
            _JUDGE_SYS, _build_user, _parse as _parse_judge,
        )
        title = ctrl.label or ctrl.control_id
        crit = ctrl.paraphrases or [title]
        user = _build_user(text, title, crit)
        obj = None
        # One retry when the reply could not be parsed into a verdict object.
        for _ in range(2):
            obj = _parse_judge(await _call_anthropic(_JUDGE_SYS, user, max_tokens=400))
            if obj:
                break
        verdict = obj.get("erfuellt") if obj else None
        # Only a real boolean counts as a verdict. A missing/null/non-boolean
        # "erfuellt" must stay undetermined: coercing it with bool() would
        # report present=False (or True for a string like "false"), and
        # callers treat present=False as "take the pass back".
        if not isinstance(verdict, bool):
            return CheckResult(present=None, source="haiku")
        return CheckResult(
            present=verdict,
            evidence=(obj.get("begruendung") or "")[:120],
            confidence=float(obj.get("confidence") or 0.0),
            source="haiku",
        )
    except Exception as e:
        # Best-effort tier: a failing judge must never break the caller.
        logger.info("llm haiku checker fail %s: %s", ctrl.control_id, str(e)[:80])
        return CheckResult(present=None, source="error")
@@ -0,0 +1,68 @@
"""Prüfer-Router — method-agnostischer Dispatch.
control → sensor_classification (verification_method + decision_method) → Checker.
Ein neues Modul liefert nur ControlSpecs; der Router wählt den Prüfer. Damit wird
der „Embedding findet, Claude entscheidet"-Pfad EIN gemeinsamer CONTENT/LLM-Prüfer
statt Cookie-Sonderlogik. Nicht-gebaute Prüfer (PLAYWRIGHT/AUDIT/SCANNER/REGEX-
FIELD) → present=None (fail-safe: Aufrufer behält sein deterministisches Ergebnis).
"""
from __future__ import annotations
from typing import Any, Optional
from .base import CheckResult, ControlSpec, DecisionMethod, DocContext
from .embedding_checker import EmbeddingChecker
from .llm_checker import LLMChecker
from .reference_checker import ReferenceChecker
# Checker instances created once at import time and shared by every routed check.
_LLM = LLMChecker()
_EMB = EmbeddingChecker()
_REF = ReferenceChecker()
# decision_method -> checker. Mechanisms that are not built yet deliberately
# have no entry, so a lookup for them yields None.
_BY_DECISION: dict[str, Any] = {
    DecisionMethod.LLM: _LLM,
    DecisionMethod.EMBEDDING: _EMB,
    DecisionMethod.LINK_RESOLVER: _REF,
}
async def route_and_check(ctrl: ControlSpec, doc: DocContext) -> CheckResult:
    """Dispatch a control to the checker registered for its decision method.

    The control's ``decision_method`` is upper-cased (an unset value is
    treated as the empty string) and looked up in ``_BY_DECISION``.

    Args:
        ctrl: Control specification to check.
        doc: Document context passed through to the checker.

    Returns:
        The selected checker's result, or a ``CheckResult`` with
        ``present=None`` and ``source="no_checker:<decision_method>"`` when
        no checker is registered for the method.
    """
    method_key = (ctrl.decision_method or "").upper()
    selected = _BY_DECISION.get(method_key)
    if selected is not None:
        return await selected.check(ctrl, doc)
    # Fail-safe: no verdict, so the caller keeps whatever result it already has.
    return CheckResult(present=None, source=f"no_checker:{ctrl.decision_method}")
def build_spec(
    control_id: str,
    sensor_classification: Optional[dict[str, Any]],
    *,
    label: str = "",
    criteria: Optional[list] = None,
    question: str = "",
    patterns: Optional[list[str]] = None,
    embed_threshold: Optional[float] = None,
) -> ControlSpec:
    """Build a ``ControlSpec`` from a sensor classification and control criteria.

    ``verification_method`` and ``decision_method`` are read from
    ``sensor_classification`` and upper-cased; a missing classification or
    missing keys yield empty strings. For the CONTENT/LLM combination the
    spec's ``extra`` carries ``judge="haiku"``; otherwise ``extra`` is empty.

    Args:
        control_id: Identifier of the control.
        sensor_classification: Mapping holding ``verification_method`` and
            ``decision_method``; may be ``None``.
        label: Label stored on the spec.
        criteria: Criteria for the control; falsy entries are dropped and
            the rest are converted to ``str`` and stored as ``paraphrases``.
        question: Question text stored on the spec.
        patterns: Patterns stored on the spec; ``None`` becomes an empty list.
        embed_threshold: Threshold stored on the spec unchanged.

    Returns:
        The assembled ``ControlSpec``.
    """
    classification = sensor_classification or {}
    verification = (classification.get("verification_method") or "").upper()
    decision = (classification.get("decision_method") or "").upper()

    # Only the CONTENT/LLM combination is routed to the Haiku judge.
    is_content_llm = verification == "CONTENT" and decision == "LLM"
    extra: dict[str, Any] = {"judge": "haiku"} if is_content_llm else {}

    paraphrases = []
    for criterion in criteria or []:
        if criterion:
            paraphrases.append(str(criterion))

    return ControlSpec(
        control_id=control_id,
        verification_method=verification,
        decision_method=decision,
        label=label,
        paraphrases=paraphrases,
        question=question,
        patterns=patterns or [],
        embed_threshold=embed_threshold,
        extra=extra,
    )
@@ -32,32 +32,30 @@ async def judge_rescued(text: str, results: list[dict[str, Any]]) -> int:
Nimmt passed zurueck, wenn der Judge die Pflicht als NICHT erfuellt sieht. Nimmt passed zurueck, wenn der Judge die Pflicht als NICHT erfuellt sieht.
Gibt die Anzahl zurueckgenommener (korrigierter) Rescues zurueck. Gibt die Anzahl zurueckgenommener (korrigierter) Rescues zurueck.
""" """
from compliance.services.llm_cascade import _call_anthropic # Über den gemeinsamen Prüfer-Router (kein Cookie-Sonderfall mehr):
from compliance.services.specialist_agents.dse.deep_check import ( # CONTENT/LLM → build_spec setzt judge='haiku' → LLMChecker (validierter
_JUDGE_SYS, _build_user, _parse, # Sufficiency-Judge). Damit ist Cookie der erste echte Router-Consumer.
) from compliance.services.checkers.base import DocContext
from compliance.services.checkers.router import build_spec, route_and_check
candidates = [r for r in results if _is_rescued(r)] candidates = [r for r in results if _is_rescued(r)]
if not candidates: if not candidates:
return 0 return 0
doc = DocContext(text=text)
sc = {"verification_method": "CONTENT", "decision_method": "LLM"}
corrected = 0 corrected = 0
for r in candidates: for r in candidates:
crit = r.get("_pass_criteria") or [r.get("label") or r.get("hint") or ""] crit = r.get("_pass_criteria") or [r.get("label") or r.get("hint") or ""]
if not isinstance(crit, list): if not isinstance(crit, list):
crit = [str(crit)] crit = [str(crit)]
title = r.get("label") or r.get("hint") or r.get("control_id") or "" label = r.get("label") or r.get("hint") or r.get("control_id") or ""
user = _build_user(text, title, crit) spec = build_spec(r.get("control_id") or "", sc, label=label, criteria=crit)
verdict = None res = await route_and_check(spec, doc)
for _ in range(2): # retry on transient/malformed if res.present is False:
p = _parse(await _call_anthropic(_JUDGE_SYS, user, max_tokens=400))
if p:
verdict = p
break
if verdict is not None and verdict.get("erfuellt") is False:
r["passed"] = False r["passed"] = False
r["source"] = (r.get("source") or "") + "+llm_failed" r["source"] = (r.get("source") or "") + "+llm_failed"
r["matched_text"] = "[layer-3 sufficiency-judge: nicht erfuellt]" r["matched_text"] = "[layer-3 sufficiency-judge: nicht erfuellt]"
r["_judge_reason"] = (verdict.get("begruendung") or "")[:200] r["_judge_reason"] = (res.evidence or "")[:200]
corrected += 1 corrected += 1
if corrected: if corrected:
logger.info("cookie layer-3 sufficiency-judge: %d/%d rescues zurueckgenommen", logger.info("cookie layer-3 sufficiency-judge: %d/%d rescues zurueckgenommen",
@@ -0,0 +1,51 @@
"""Prüfer-Router: build_spec aus sensor_classification + method-agnostischer
Dispatch. CONTENT/LLM -> Haiku-Sufficiency-Tier (validiert), unbekannte
decision_methods -> fail-safe present=None."""
import pytest
from unittest.mock import AsyncMock, patch
from compliance.services.checkers.base import DocContext
from compliance.services.checkers.router import build_spec, route_and_check
_ANTHROPIC = "compliance.services.llm_cascade._call_anthropic"
def test_build_spec_content_llm_uses_haiku():
    """CONTENT/LLM classification selects the Haiku judge and keeps the criteria."""
    classification = {"verification_method": "CONTENT", "decision_method": "LLM"}
    spec = build_spec("X", classification, label="L", criteria=["a", "b"])
    assert spec.verification_method == "CONTENT"
    assert spec.decision_method == "LLM"
    assert spec.extra.get("judge") == "haiku"
    assert spec.paraphrases == ["a", "b"]
def test_build_spec_embedding_no_haiku():
    """A CONTENT/EMBEDDING classification must not set a judge."""
    classification = {"verification_method": "CONTENT", "decision_method": "EMBEDDING"}
    spec = build_spec("X", classification)
    assert spec.extra.get("judge") is None
@pytest.mark.asyncio
async def test_route_unknown_decision_is_failsafe():
    """A decision method without a registered checker yields no verdict."""
    classification = {"verification_method": "BEHAVIOR", "decision_method": "PLAYWRIGHT"}
    spec = build_spec("X", classification)
    result = await route_and_check(spec, DocContext(text="x" * 200))
    assert result.present is None
    assert "no_checker" in result.source
@pytest.mark.asyncio
async def test_route_content_llm_haiku_fehlt():
    """A negative judge reply is reported as present=False from the haiku tier."""
    classification = {"verification_method": "CONTENT", "decision_method": "LLM"}
    spec = build_spec("X", classification,
                      label="Speicherdauer", criteria=["Höchstdauer pro Kategorie"])
    judge_reply = '{"erfuellt": false, "confidence": 0.9, "begruendung": "fehlt"}'
    fake = AsyncMock(return_value=judge_reply)
    document = DocContext(text="Wir nutzen Cookies. " * 30)
    with patch(_ANTHROPIC, new=fake):
        result = await route_and_check(spec, document)
    assert result.present is False
    assert result.source == "haiku"
    assert fake.call_count >= 1
@pytest.mark.asyncio
async def test_route_content_llm_haiku_erfuellt():
    """A positive judge reply is reported as present=True."""
    classification = {"verification_method": "CONTENT", "decision_method": "LLM"}
    spec = build_spec("X", classification, label="L", criteria=["x"])
    fake = AsyncMock(return_value='{"erfuellt": true, "confidence": 0.8}')
    document = DocContext(text="text " * 40)
    with patch(_ANTHROPIC, new=fake):
        result = await route_and_check(spec, document)
    assert result.present is True
@@ -8,6 +8,7 @@ from compliance.services.specialist_agents.cookie_policy._sufficiency_judge impo
) )
_ANTHROPIC = "compliance.services.llm_cascade._call_anthropic" _ANTHROPIC = "compliance.services.llm_cascade._call_anthropic"
_DOC = "Volltext der Cookie-Richtlinie mit ausreichend Inhalt. " * 4
def _r(cid, source, passed=True): def _r(cid, source, passed=True):
@@ -20,7 +21,7 @@ async def test_rescued_unpassed_when_judge_fehlt():
results = [_r("A", "keyword+embedding")] results = [_r("A", "keyword+embedding")]
fake = AsyncMock(return_value='{"erfuellt": false, "confidence": 0.9, "begruendung": "fehlt"}') fake = AsyncMock(return_value='{"erfuellt": false, "confidence": 0.9, "begruendung": "fehlt"}')
with patch(_ANTHROPIC, new=fake): with patch(_ANTHROPIC, new=fake):
n = await judge_rescued("text", results) n = await judge_rescued(_DOC, results)
assert n == 1 assert n == 1
assert results[0]["passed"] is False assert results[0]["passed"] is False
assert "+llm_failed" in results[0]["source"] assert "+llm_failed" in results[0]["source"]
@@ -31,7 +32,7 @@ async def test_rescued_kept_when_judge_erfuellt():
results = [_r("A", "keyword+embedding")] results = [_r("A", "keyword+embedding")]
fake = AsyncMock(return_value='{"erfuellt": true, "confidence": 0.9}') fake = AsyncMock(return_value='{"erfuellt": true, "confidence": 0.9}')
with patch(_ANTHROPIC, new=fake): with patch(_ANTHROPIC, new=fake):
n = await judge_rescued("text", results) n = await judge_rescued(_DOC, results)
assert n == 0 assert n == 0
assert results[0]["passed"] is True assert results[0]["passed"] is True
@@ -42,7 +43,7 @@ async def test_keyword_pass_not_judged():
results = [_r("A", "keyword")] results = [_r("A", "keyword")]
fake = AsyncMock(return_value='{"erfuellt": false}') fake = AsyncMock(return_value='{"erfuellt": false}')
with patch(_ANTHROPIC, new=fake): with patch(_ANTHROPIC, new=fake):
n = await judge_rescued("text", results) n = await judge_rescued(_DOC, results)
assert n == 0 assert n == 0
assert results[0]["passed"] is True assert results[0]["passed"] is True
assert fake.call_count == 0 assert fake.call_count == 0
@@ -53,7 +54,7 @@ async def test_boost_rescue_is_judged():
results = [_r("A", "keyword+regex_boost")] results = [_r("A", "keyword+regex_boost")]
fake = AsyncMock(return_value='{"erfuellt": false}') fake = AsyncMock(return_value='{"erfuellt": false}')
with patch(_ANTHROPIC, new=fake): with patch(_ANTHROPIC, new=fake):
n = await judge_rescued("text", results) n = await judge_rescued(_DOC, results)
assert n == 1 and results[0]["passed"] is False assert n == 1 and results[0]["passed"] is False
@@ -63,5 +64,5 @@ async def test_failed_controls_ignored():
results = [_r("A", "keyword+embedding", passed=False)] results = [_r("A", "keyword+embedding", passed=False)]
fake = AsyncMock(return_value='{"erfuellt": false}') fake = AsyncMock(return_value='{"erfuellt": false}')
with patch(_ANTHROPIC, new=fake): with patch(_ANTHROPIC, new=fake):
n = await judge_rescued("text", results) n = await judge_rescued(_DOC, results)
assert n == 0 and fake.call_count == 0 assert n == 0 and fake.call_count == 0