feat(checkers): platform router + Haiku sufficiency tier; cookie is first consumer

Generalise "Embedding finds, Claude decides" into the shared Pruefer-Library:
- router.route_and_check dispatches control -> sensor_classification -> Checker.
- build_spec reads sensor_classification (CONTENT/LLM -> judge=haiku, the
  validated sufficiency tier; the Qwen-first cascade is disproven for sufficiency).
- LLMChecker gains a Haiku-direct tier (reuses the validated deep_check prompt).
- Cookie Layer-3 now routes through route_and_check instead of bespoke code, so
  cookie is the first real router consumer -- proves the architecture end-to-end.

Reproduces the validated result via the shared path: FN 159->14, recall
0.13->0.92, precision 0.89 (vs bespoke 12/0.93/0.90 -- within Haiku noise).
Tests: 10/10 (router dispatch + build_spec + haiku tier + cookie rewire).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-06-22 12:56:14 +02:00
parent e809d0bc1c
commit 3e3644f83d
5 changed files with 170 additions and 19 deletions
@@ -45,6 +45,11 @@ class LLMChecker:
text = doc.text or ""
if len(text) < 50:
return CheckResult(present=None, source="llm")
# decision_method=LLM mit judge='haiku': Sufficiency-Pfad (validiert
# P0.89/R0.91). Der Qwen-first-Cascade ist als Sufficiency-Judge
# widerlegt -> hier Haiku direkt, kriteriengeführte Subsumtion.
if (ctrl.extra or {}).get("judge") == "haiku":
return await self._haiku(ctrl, text)
secs = _sections(text)
if ctrl.topic_regex:
rel = [s for s in secs if re.search(ctrl.topic_regex, s, re.I)][:6] or secs[:6]
@@ -71,3 +76,31 @@ class LLMChecker:
except Exception as e:
logger.info("llm checker fail %s: %s", ctrl.control_id, str(e)[:80])
return CheckResult(present=None, source="error")
async def _haiku(self, ctrl: ControlSpec, text: str) -> CheckResult:
    """Judge sufficiency with a direct Haiku call (criteria-guided).

    The legal criteria are taken from ``ctrl.paraphrases``; when that is
    empty, the control label (or, failing that, the control id) is used as
    the single criterion. The prompt is built with the deep_check
    sufficiency helpers, and the model is asked at most twice, stopping at
    the first reply that parses.

    Args:
        ctrl: Control to judge; ``label``, ``control_id`` and
            ``paraphrases`` are read.
        text: Document text handed to the judge.

    Returns:
        A ``CheckResult`` with ``source="haiku"``. ``present`` is the
        judge's boolean ``erfuellt`` verdict, or ``None`` when no reply
        parsed or the reply carries no boolean verdict. If anything raises,
        the result is ``present=None`` with ``source="error"``.
    """
    try:
        from compliance.services.llm_cascade import _call_anthropic
        from compliance.services.specialist_agents.dse.deep_check import (
            _JUDGE_SYS, _build_user, _parse as _parse_judge,
        )
        title = ctrl.label or ctrl.control_id
        criteria = ctrl.paraphrases or [title]
        user = _build_user(text, title, criteria)
        obj = None
        for _ in range(2):  # second attempt if the first reply does not parse
            obj = _parse_judge(await _call_anthropic(_JUDGE_SYS, user, max_tokens=400))
            if obj:
                break
        if not obj:
            return CheckResult(present=None, source="haiku")
        # Tri-state verdict: only an explicit boolean counts as a decision.
        # A missing/null "erfuellt" must not be coerced to False, otherwise
        # callers that act on `present is False` would treat an undecided
        # reply as "not fulfilled".
        verdict = obj.get("erfuellt")
        present = verdict if isinstance(verdict, bool) else None
        # A malformed confidence value should not discard a valid verdict.
        try:
            confidence = float(obj.get("confidence") or 0.0)
        except (TypeError, ValueError):
            confidence = 0.0
        return CheckResult(
            present=present,
            evidence=str(obj.get("begruendung") or "")[:120],
            confidence=confidence,
            source="haiku",
        )
    except Exception as e:
        logger.info("llm haiku checker fail %s: %s", ctrl.control_id, str(e)[:80])
        return CheckResult(present=None, source="error")
@@ -0,0 +1,68 @@
"""Prüfer-Router — method-agnostischer Dispatch.
control → sensor_classification (verification_method + decision_method) → Checker.
Ein neues Modul liefert nur ControlSpecs; der Router wählt den Prüfer. Damit wird
der „Embedding findet, Claude entscheidet"-Pfad EIN gemeinsamer CONTENT/LLM-Prüfer
statt Cookie-Sonderlogik. Nicht-gebaute Prüfer (PLAYWRIGHT/AUDIT/SCANNER/REGEX-
FIELD) → present=None (fail-safe: Aufrufer behält sein deterministisches Ergebnis).
"""
from __future__ import annotations
from typing import Any, Optional
from .base import CheckResult, ControlSpec, DecisionMethod, DocContext
from .embedding_checker import EmbeddingChecker
from .llm_checker import LLMChecker
from .reference_checker import ReferenceChecker
# Checker instances created once when this module is imported; the dispatch
# table below refers to these same objects for every routed call.
_LLM = LLMChecker()
_EMB = EmbeddingChecker()
_REF = ReferenceChecker()
# decision_method -> checker. Mechanisms that are not built yet have no entry
# on purpose, so a lookup for them yields None.
_BY_DECISION: dict[str, Any] = {
DecisionMethod.LLM: _LLM,
DecisionMethod.EMBEDDING: _EMB,
DecisionMethod.LINK_RESOLVER: _REF,
}
async def route_and_check(ctrl: ControlSpec, doc: DocContext) -> CheckResult:
    """Dispatch a control to the checker registered for its decision method.

    The control's ``decision_method`` is upper-cased (an empty/None value
    becomes ``""``) and looked up in ``_BY_DECISION``. When a checker is
    registered, its ``check(ctrl, doc)`` result is returned. Otherwise the
    result is undecided (``present=None``) and ``source`` names the
    unhandled decision method, so the caller can keep its own outcome.
    """
    decision_key = (ctrl.decision_method or "").upper()
    selected = _BY_DECISION.get(decision_key)
    if selected is not None:
        return await selected.check(ctrl, doc)
    return CheckResult(
        present=None,
        source=f"no_checker:{ctrl.decision_method}",
    )
def build_spec(
    control_id: str,
    sensor_classification: Optional[dict[str, Any]],
    *,
    label: str = "",
    criteria: Optional[list] = None,
    question: str = "",
    patterns: Optional[list[str]] = None,
    embed_threshold: Optional[float] = None,
) -> ControlSpec:
    """Create a ControlSpec from a stored sensor_classification and criteria.

    ``verification_method`` and ``decision_method`` are read from
    ``sensor_classification`` (a missing mapping or missing keys count as
    empty strings) and upper-cased. For the CONTENT/LLM combination the spec
    gets ``extra={"judge": "haiku"}`` (the default sufficiency judge per the
    decision of 2026-06-22); every other combination gets an empty ``extra``.

    Falsy entries in ``criteria`` are dropped and the remaining ones are
    stringified into ``paraphrases``. ``patterns`` defaults to an empty list.
    """
    classification = sensor_classification or {}
    verification = (classification.get("verification_method") or "").upper()
    decision = (classification.get("decision_method") or "").upper()

    # Only content checks decided by an LLM are routed to the Haiku judge.
    uses_haiku = verification == "CONTENT" and decision == "LLM"
    extra: dict[str, Any] = {"judge": "haiku"} if uses_haiku else {}

    paraphrases = [str(item) for item in criteria if item] if criteria else []

    return ControlSpec(
        control_id=control_id,
        verification_method=verification,
        decision_method=decision,
        label=label,
        paraphrases=paraphrases,
        question=question,
        patterns=patterns if patterns else [],
        embed_threshold=embed_threshold,
        extra=extra,
    )
@@ -32,32 +32,30 @@ async def judge_rescued(text: str, results: list[dict[str, Any]]) -> int:
Nimmt passed zurueck, wenn der Judge die Pflicht als NICHT erfuellt sieht.
Gibt die Anzahl zurueckgenommener (korrigierter) Rescues zurueck.
"""
from compliance.services.llm_cascade import _call_anthropic
from compliance.services.specialist_agents.dse.deep_check import (
_JUDGE_SYS, _build_user, _parse,
)
# Über den gemeinsamen Prüfer-Router (kein Cookie-Sonderfall mehr):
# CONTENT/LLM → build_spec setzt judge='haiku' → LLMChecker (validierter
# Sufficiency-Judge). Damit ist Cookie der erste echte Router-Consumer.
from compliance.services.checkers.base import DocContext
from compliance.services.checkers.router import build_spec, route_and_check
candidates = [r for r in results if _is_rescued(r)]
if not candidates:
return 0
doc = DocContext(text=text)
sc = {"verification_method": "CONTENT", "decision_method": "LLM"}
corrected = 0
for r in candidates:
crit = r.get("_pass_criteria") or [r.get("label") or r.get("hint") or ""]
if not isinstance(crit, list):
crit = [str(crit)]
title = r.get("label") or r.get("hint") or r.get("control_id") or ""
user = _build_user(text, title, crit)
verdict = None
for _ in range(2): # retry on transient/malformed
p = _parse(await _call_anthropic(_JUDGE_SYS, user, max_tokens=400))
if p:
verdict = p
break
if verdict is not None and verdict.get("erfuellt") is False:
label = r.get("label") or r.get("hint") or r.get("control_id") or ""
spec = build_spec(r.get("control_id") or "", sc, label=label, criteria=crit)
res = await route_and_check(spec, doc)
if res.present is False:
r["passed"] = False
r["source"] = (r.get("source") or "") + "+llm_failed"
r["matched_text"] = "[layer-3 sufficiency-judge: nicht erfuellt]"
r["_judge_reason"] = (verdict.get("begruendung") or "")[:200]
r["_judge_reason"] = (res.evidence or "")[:200]
corrected += 1
if corrected:
logger.info("cookie layer-3 sufficiency-judge: %d/%d rescues zurueckgenommen",