feat(checkers): platform router + Haiku sufficiency tier; cookie is first consumer
Generalise "Embedding finds, Claude decides" into the shared Pruefer-Library:

- router.route_and_check dispatches control -> sensor_classification -> Checker.
- build_spec reads sensor_classification (CONTENT/LLM -> judge=haiku, the validated sufficiency tier; the Qwen-first cascade is disproven for sufficiency).
- LLMChecker gains a Haiku-direct tier (reuses the validated deep_check prompt).
- Cookie Layer-3 now routes through route_and_check instead of bespoke code, so cookie is the first real router consumer -- proves the architecture end-to-end.

Reproduces the validated result via the shared path: FN 159->14, recall 0.13->0.92, precision 0.89 (vs bespoke 12/0.93/0.90 -- within Haiku noise).

Tests: 10/10 (router dispatch + build_spec + haiku tier + cookie rewire).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -45,6 +45,11 @@ class LLMChecker:
|
|||||||
text = doc.text or ""
|
text = doc.text or ""
|
||||||
if len(text) < 50:
|
if len(text) < 50:
|
||||||
return CheckResult(present=None, source="llm")
|
return CheckResult(present=None, source="llm")
|
||||||
|
# decision_method=LLM mit judge='haiku': Sufficiency-Pfad (validiert
|
||||||
|
# P0.89/R0.91). Der Qwen-first-Cascade ist als Sufficiency-Judge
|
||||||
|
# widerlegt -> hier Haiku direkt, kriteriengeführte Subsumtion.
|
||||||
|
if (ctrl.extra or {}).get("judge") == "haiku":
|
||||||
|
return await self._haiku(ctrl, text)
|
||||||
secs = _sections(text)
|
secs = _sections(text)
|
||||||
if ctrl.topic_regex:
|
if ctrl.topic_regex:
|
||||||
rel = [s for s in secs if re.search(ctrl.topic_regex, s, re.I)][:6] or secs[:6]
|
rel = [s for s in secs if re.search(ctrl.topic_regex, s, re.I)][:6] or secs[:6]
|
||||||
@@ -71,3 +76,31 @@ class LLMChecker:
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.info("llm checker fail %s: %s", ctrl.control_id, str(e)[:80])
|
logger.info("llm checker fail %s: %s", ctrl.control_id, str(e)[:80])
|
||||||
return CheckResult(present=None, source="error")
|
return CheckResult(present=None, source="error")
|
||||||
|
|
||||||
|
async def _haiku(self, ctrl: ControlSpec, text: str) -> CheckResult:
    """Judge whether *text* sufficiently covers *ctrl* by calling Haiku directly.

    Criteria-driven: the legal elements to check are taken from
    ``ctrl.paraphrases`` (falling back to the label, then the control id).
    The system prompt, user-prompt builder and reply parser are reused from
    the ``deep_check`` module.

    Args:
        ctrl: Control to judge; ``label``, ``control_id`` and ``paraphrases``
            are read.
        text: Document text handed to the judge.

    Returns:
        A ``CheckResult`` with ``source="haiku"`` whose ``present`` is
        ``True``/``False`` only when the parsed reply carries a real boolean
        ``erfuellt``. ``present=None`` (undecided) is returned when no usable
        verdict was obtained after two attempts. Any exception yields
        ``present=None`` with ``source="error"``.
    """
    try:
        # Imported inside the guard so an import failure also degrades to the
        # fail-safe "error" result instead of propagating to the caller.
        from compliance.services.llm_cascade import _call_anthropic
        from compliance.services.specialist_agents.dse.deep_check import (
            _JUDGE_SYS, _build_user, _parse as _parse_judge,
        )

        title = ctrl.label or ctrl.control_id
        crit = ctrl.paraphrases or [title]
        user = _build_user(text, title, crit)

        obj = None
        for _ in range(2):  # one retry when the reply does not parse
            obj = _parse_judge(await _call_anthropic(_JUDGE_SYS, user, max_tokens=400))
            if obj:
                break
        if not obj:
            return CheckResult(present=None, source="haiku")

        verdict = obj.get("erfuellt")
        if not isinstance(verdict, bool):
            # A missing, null or non-boolean verdict is "undecided", not
            # "not fulfilled": bool(None) would report a definite failure and
            # bool("false") a definite pass. Callers treat present=None as
            # "keep the existing result".
            return CheckResult(present=None, source="haiku")

        try:
            confidence = float(obj.get("confidence") or 0.0)
        except (TypeError, ValueError):
            # A malformed confidence must not discard a valid verdict.
            confidence = 0.0

        return CheckResult(
            present=verdict,
            evidence=str(obj.get("begruendung") or "")[:120],
            confidence=confidence,
            source="haiku",
        )
    except Exception as e:
        # Deliberate best-effort boundary: the checker never raises.
        logger.info("llm haiku checker fail %s: %s", ctrl.control_id, str(e)[:80])
        return CheckResult(present=None, source="error")
|
||||||
|
|||||||
@@ -0,0 +1,68 @@
|
|||||||
|
"""Prüfer-Router — method-agnostischer Dispatch.
|
||||||
|
|
||||||
|
control → sensor_classification (verification_method + decision_method) → Checker.
|
||||||
|
Ein neues Modul liefert nur ControlSpecs; der Router wählt den Prüfer. Damit wird
|
||||||
|
der „Embedding findet, Claude entscheidet"-Pfad EIN gemeinsamer CONTENT/LLM-Prüfer
|
||||||
|
statt Cookie-Sonderlogik. Nicht-gebaute Prüfer (PLAYWRIGHT/AUDIT/SCANNER/REGEX-
|
||||||
|
FIELD) → present=None (fail-safe: Aufrufer behält sein deterministisches Ergebnis).
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from typing import Any, Optional
|
||||||
|
|
||||||
|
from .base import CheckResult, ControlSpec, DecisionMethod, DocContext
|
||||||
|
from .embedding_checker import EmbeddingChecker
|
||||||
|
from .llm_checker import LLMChecker
|
||||||
|
from .reference_checker import ReferenceChecker
|
||||||
|
|
||||||
|
# One shared module-level instance per checker that exists today.
_LLM = LLMChecker()
_EMB = EmbeddingChecker()
_REF = ReferenceChecker()

# Dispatch table: decision_method -> checker instance. Decision methods
# without an entry are deliberately absent (no checker built yet); a lookup
# for them returns None.
_BY_DECISION: dict[str, Any] = {
    DecisionMethod.LLM: _LLM,
    DecisionMethod.EMBEDDING: _EMB,
    DecisionMethod.LINK_RESOLVER: _REF,
}
|
||||||
|
|
||||||
|
|
||||||
|
async def route_and_check(ctrl: ControlSpec, doc: DocContext) -> CheckResult:
    """Dispatch *ctrl* to the checker registered for its decision method.

    The decision method is upper-cased (``None``/empty becomes ``""``) and
    looked up in ``_BY_DECISION``.

    Returns:
        The awaited result of the selected checker's ``check(ctrl, doc)``, or
        a ``CheckResult`` with ``present=None`` and a ``no_checker:<method>``
        source when no checker is registered for the method.
    """
    method_key = (ctrl.decision_method or "").upper()
    handler = _BY_DECISION.get(method_key)
    if handler is not None:
        return await handler.check(ctrl, doc)
    # Nothing registered: report "undecided" and name the unmatched method.
    return CheckResult(
        present=None,
        source=f"no_checker:{ctrl.decision_method}",
    )
|
||||||
|
|
||||||
|
|
||||||
|
def build_spec(
    control_id: str,
    sensor_classification: Optional[dict[str, Any]],
    *,
    label: str = "",
    criteria: Optional[list] = None,
    question: str = "",
    patterns: Optional[list[str]] = None,
    embed_threshold: Optional[float] = None,
) -> ControlSpec:
    """Build a ``ControlSpec`` from a stored sensor classification.

    The classification is the one kept under
    ``canonical_controls.generation_metadata.sensor_classification``; it is
    combined with the control's criteria. A CONTENT/LLM classification marks
    the spec with ``extra["judge"] = "haiku"`` (the validated sufficiency
    judge; default for sufficiency per the decision of 2026-06-22).

    Args:
        control_id: Identifier copied onto the spec.
        sensor_classification: Mapping read for ``verification_method`` and
            ``decision_method``; ``None`` is treated as empty.
        label: Label copied onto the spec.
        criteria: Criteria; falsy items are dropped and the rest stringified
            into ``paraphrases``.
        question: Question copied onto the spec.
        patterns: Patterns copied onto the spec; a falsy value becomes ``[]``.
        embed_threshold: Threshold copied onto the spec unchanged.

    Returns:
        The assembled ``ControlSpec`` with upper-cased method names.
    """
    classification = sensor_classification if sensor_classification else {}
    verification = (classification.get("verification_method") or "").upper()
    decision = (classification.get("decision_method") or "").upper()

    # Only the CONTENT/LLM combination selects the Haiku judge.
    is_content_llm = verification == "CONTENT" and decision == "LLM"
    extra: dict[str, Any] = {"judge": "haiku"} if is_content_llm else {}

    paraphrases = []
    for criterion in criteria or []:
        if criterion:
            paraphrases.append(str(criterion))

    return ControlSpec(
        control_id=control_id,
        verification_method=verification,
        decision_method=decision,
        label=label,
        paraphrases=paraphrases,
        question=question,
        patterns=patterns or [],
        embed_threshold=embed_threshold,
        extra=extra,
    )
|
||||||
+12
-14
@@ -32,32 +32,30 @@ async def judge_rescued(text: str, results: list[dict[str, Any]]) -> int:
|
|||||||
Nimmt passed zurueck, wenn der Judge die Pflicht als NICHT erfuellt sieht.
|
Nimmt passed zurueck, wenn der Judge die Pflicht als NICHT erfuellt sieht.
|
||||||
Gibt die Anzahl zurueckgenommener (korrigierter) Rescues zurueck.
|
Gibt die Anzahl zurueckgenommener (korrigierter) Rescues zurueck.
|
||||||
"""
|
"""
|
||||||
from compliance.services.llm_cascade import _call_anthropic
|
# Über den gemeinsamen Prüfer-Router (kein Cookie-Sonderfall mehr):
|
||||||
from compliance.services.specialist_agents.dse.deep_check import (
|
# CONTENT/LLM → build_spec setzt judge='haiku' → LLMChecker (validierter
|
||||||
_JUDGE_SYS, _build_user, _parse,
|
# Sufficiency-Judge). Damit ist Cookie der erste echte Router-Consumer.
|
||||||
)
|
from compliance.services.checkers.base import DocContext
|
||||||
|
from compliance.services.checkers.router import build_spec, route_and_check
|
||||||
|
|
||||||
candidates = [r for r in results if _is_rescued(r)]
|
candidates = [r for r in results if _is_rescued(r)]
|
||||||
if not candidates:
|
if not candidates:
|
||||||
return 0
|
return 0
|
||||||
|
doc = DocContext(text=text)
|
||||||
|
sc = {"verification_method": "CONTENT", "decision_method": "LLM"}
|
||||||
corrected = 0
|
corrected = 0
|
||||||
for r in candidates:
|
for r in candidates:
|
||||||
crit = r.get("_pass_criteria") or [r.get("label") or r.get("hint") or ""]
|
crit = r.get("_pass_criteria") or [r.get("label") or r.get("hint") or ""]
|
||||||
if not isinstance(crit, list):
|
if not isinstance(crit, list):
|
||||||
crit = [str(crit)]
|
crit = [str(crit)]
|
||||||
title = r.get("label") or r.get("hint") or r.get("control_id") or ""
|
label = r.get("label") or r.get("hint") or r.get("control_id") or ""
|
||||||
user = _build_user(text, title, crit)
|
spec = build_spec(r.get("control_id") or "", sc, label=label, criteria=crit)
|
||||||
verdict = None
|
res = await route_and_check(spec, doc)
|
||||||
for _ in range(2): # retry on transient/malformed
|
if res.present is False:
|
||||||
p = _parse(await _call_anthropic(_JUDGE_SYS, user, max_tokens=400))
|
|
||||||
if p:
|
|
||||||
verdict = p
|
|
||||||
break
|
|
||||||
if verdict is not None and verdict.get("erfuellt") is False:
|
|
||||||
r["passed"] = False
|
r["passed"] = False
|
||||||
r["source"] = (r.get("source") or "") + "+llm_failed"
|
r["source"] = (r.get("source") or "") + "+llm_failed"
|
||||||
r["matched_text"] = "[layer-3 sufficiency-judge: nicht erfuellt]"
|
r["matched_text"] = "[layer-3 sufficiency-judge: nicht erfuellt]"
|
||||||
r["_judge_reason"] = (verdict.get("begruendung") or "")[:200]
|
r["_judge_reason"] = (res.evidence or "")[:200]
|
||||||
corrected += 1
|
corrected += 1
|
||||||
if corrected:
|
if corrected:
|
||||||
logger.info("cookie layer-3 sufficiency-judge: %d/%d rescues zurueckgenommen",
|
logger.info("cookie layer-3 sufficiency-judge: %d/%d rescues zurueckgenommen",
|
||||||
|
|||||||
@@ -0,0 +1,51 @@
|
|||||||
|
"""Prüfer-Router: build_spec aus sensor_classification + method-agnostischer
|
||||||
|
Dispatch. CONTENT/LLM -> Haiku-Sufficiency-Tier (validiert), unbekannte
|
||||||
|
decision_methods -> fail-safe present=None."""
|
||||||
|
import pytest
|
||||||
|
from unittest.mock import AsyncMock, patch
|
||||||
|
|
||||||
|
from compliance.services.checkers.base import DocContext
|
||||||
|
from compliance.services.checkers.router import build_spec, route_and_check
|
||||||
|
|
||||||
|
_ANTHROPIC = "compliance.services.llm_cascade._call_anthropic"
|
||||||
|
|
||||||
|
|
||||||
|
def test_build_spec_content_llm_uses_haiku():
    """A CONTENT/LLM classification yields a spec flagged for the Haiku judge."""
    classification = {"verification_method": "CONTENT", "decision_method": "LLM"}
    spec = build_spec("X", classification, label="L", criteria=["a", "b"])
    assert spec.verification_method == "CONTENT"
    assert spec.decision_method == "LLM"
    assert spec.extra.get("judge") == "haiku"
    assert spec.paraphrases == ["a", "b"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_build_spec_embedding_no_haiku():
    """A CONTENT/EMBEDDING classification must not set a judge."""
    classification = {
        "verification_method": "CONTENT",
        "decision_method": "EMBEDDING",
    }
    spec = build_spec("X", classification)
    assert spec.extra.get("judge") is None
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_route_unknown_decision_is_failsafe():
    """A decision method without a registered checker yields present=None."""
    classification = {
        "verification_method": "BEHAVIOR",
        "decision_method": "PLAYWRIGHT",
    }
    spec = build_spec("X", classification)
    document = DocContext(text="x" * 200)
    result = await route_and_check(spec, document)
    assert result.present is None
    assert "no_checker" in result.source
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_route_content_llm_haiku_fehlt():
    """A negative judge reply is routed back as present=False from 'haiku'."""
    classification = {"verification_method": "CONTENT", "decision_method": "LLM"}
    spec = build_spec(
        "X",
        classification,
        label="Speicherdauer",
        criteria=["Höchstdauer pro Kategorie"],
    )
    reply = '{"erfuellt": false, "confidence": 0.9, "begruendung": "fehlt"}'
    fake = AsyncMock(return_value=reply)
    document = DocContext(text="Wir nutzen Cookies. " * 30)
    with patch(_ANTHROPIC, new=fake):
        result = await route_and_check(spec, document)
    assert result.present is False
    assert result.source == "haiku"
    assert fake.call_count >= 1
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_route_content_llm_haiku_erfuellt():
    """A positive judge reply is routed back as present=True."""
    classification = {"verification_method": "CONTENT", "decision_method": "LLM"}
    spec = build_spec("X", classification, label="L", criteria=["x"])
    fake = AsyncMock(return_value='{"erfuellt": true, "confidence": 0.8}')
    document = DocContext(text="text " * 40)
    with patch(_ANTHROPIC, new=fake):
        result = await route_and_check(spec, document)
    assert result.present is True
|
||||||
@@ -8,6 +8,7 @@ from compliance.services.specialist_agents.cookie_policy._sufficiency_judge impo
|
|||||||
)
|
)
|
||||||
|
|
||||||
_ANTHROPIC = "compliance.services.llm_cascade._call_anthropic"
|
_ANTHROPIC = "compliance.services.llm_cascade._call_anthropic"
|
||||||
|
_DOC = "Volltext der Cookie-Richtlinie mit ausreichend Inhalt. " * 4
|
||||||
|
|
||||||
|
|
||||||
def _r(cid, source, passed=True):
|
def _r(cid, source, passed=True):
|
||||||
@@ -20,7 +21,7 @@ async def test_rescued_unpassed_when_judge_fehlt():
|
|||||||
results = [_r("A", "keyword+embedding")]
|
results = [_r("A", "keyword+embedding")]
|
||||||
fake = AsyncMock(return_value='{"erfuellt": false, "confidence": 0.9, "begruendung": "fehlt"}')
|
fake = AsyncMock(return_value='{"erfuellt": false, "confidence": 0.9, "begruendung": "fehlt"}')
|
||||||
with patch(_ANTHROPIC, new=fake):
|
with patch(_ANTHROPIC, new=fake):
|
||||||
n = await judge_rescued("text", results)
|
n = await judge_rescued(_DOC, results)
|
||||||
assert n == 1
|
assert n == 1
|
||||||
assert results[0]["passed"] is False
|
assert results[0]["passed"] is False
|
||||||
assert "+llm_failed" in results[0]["source"]
|
assert "+llm_failed" in results[0]["source"]
|
||||||
@@ -31,7 +32,7 @@ async def test_rescued_kept_when_judge_erfuellt():
|
|||||||
results = [_r("A", "keyword+embedding")]
|
results = [_r("A", "keyword+embedding")]
|
||||||
fake = AsyncMock(return_value='{"erfuellt": true, "confidence": 0.9}')
|
fake = AsyncMock(return_value='{"erfuellt": true, "confidence": 0.9}')
|
||||||
with patch(_ANTHROPIC, new=fake):
|
with patch(_ANTHROPIC, new=fake):
|
||||||
n = await judge_rescued("text", results)
|
n = await judge_rescued(_DOC, results)
|
||||||
assert n == 0
|
assert n == 0
|
||||||
assert results[0]["passed"] is True
|
assert results[0]["passed"] is True
|
||||||
|
|
||||||
@@ -42,7 +43,7 @@ async def test_keyword_pass_not_judged():
|
|||||||
results = [_r("A", "keyword")]
|
results = [_r("A", "keyword")]
|
||||||
fake = AsyncMock(return_value='{"erfuellt": false}')
|
fake = AsyncMock(return_value='{"erfuellt": false}')
|
||||||
with patch(_ANTHROPIC, new=fake):
|
with patch(_ANTHROPIC, new=fake):
|
||||||
n = await judge_rescued("text", results)
|
n = await judge_rescued(_DOC, results)
|
||||||
assert n == 0
|
assert n == 0
|
||||||
assert results[0]["passed"] is True
|
assert results[0]["passed"] is True
|
||||||
assert fake.call_count == 0
|
assert fake.call_count == 0
|
||||||
@@ -53,7 +54,7 @@ async def test_boost_rescue_is_judged():
|
|||||||
results = [_r("A", "keyword+regex_boost")]
|
results = [_r("A", "keyword+regex_boost")]
|
||||||
fake = AsyncMock(return_value='{"erfuellt": false}')
|
fake = AsyncMock(return_value='{"erfuellt": false}')
|
||||||
with patch(_ANTHROPIC, new=fake):
|
with patch(_ANTHROPIC, new=fake):
|
||||||
n = await judge_rescued("text", results)
|
n = await judge_rescued(_DOC, results)
|
||||||
assert n == 1 and results[0]["passed"] is False
|
assert n == 1 and results[0]["passed"] is False
|
||||||
|
|
||||||
|
|
||||||
@@ -63,5 +64,5 @@ async def test_failed_controls_ignored():
|
|||||||
results = [_r("A", "keyword+embedding", passed=False)]
|
results = [_r("A", "keyword+embedding", passed=False)]
|
||||||
fake = AsyncMock(return_value='{"erfuellt": false}')
|
fake = AsyncMock(return_value='{"erfuellt": false}')
|
||||||
with patch(_ANTHROPIC, new=fake):
|
with patch(_ANTHROPIC, new=fake):
|
||||||
n = await judge_rescued("text", results)
|
n = await judge_rescued(_DOC, results)
|
||||||
assert n == 0 and fake.call_count == 0
|
assert n == 0 and fake.call_count == 0
|
||||||
|
|||||||
Reference in New Issue
Block a user