diff --git a/backend-compliance/compliance/services/checkers/llm_checker.py b/backend-compliance/compliance/services/checkers/llm_checker.py
index ec8bc5d0..278376d5 100644
--- a/backend-compliance/compliance/services/checkers/llm_checker.py
+++ b/backend-compliance/compliance/services/checkers/llm_checker.py
@@ -45,6 +45,11 @@ class LLMChecker:
         text = doc.text or ""
         if len(text) < 50:
             return CheckResult(present=None, source="llm")
+        # decision_method=LLM with judge='haiku': sufficiency path (validated at
+        # P0.89/R0.91). The Qwen-first cascade was refuted as a sufficiency
+        # judge -> call Haiku directly here, criteria-guided subsumption.
+        if (ctrl.extra or {}).get("judge") == "haiku":
+            return await self._haiku(ctrl, text)
         secs = _sections(text)
         if ctrl.topic_regex:
             rel = [s for s in secs if re.search(ctrl.topic_regex, s, re.I)][:6] or secs[:6]
@@ -71,3 +76,34 @@ class LLMChecker:
         except Exception as e:
             logger.info("llm checker fail %s: %s", ctrl.control_id, str(e)[:80])
             return CheckResult(present=None, source="error")
+
+    async def _haiku(self, ctrl: ControlSpec, text: str) -> CheckResult:
+        """Sufficiency via Haiku directly (validated judge). Criteria-guided:
+        the legal elements live in ctrl.paraphrases; reuses the validated
+        deep_check sufficiency prompt."""
+        try:
+            from compliance.services.llm_cascade import _call_anthropic
+            from compliance.services.specialist_agents.dse.deep_check import (
+                _JUDGE_SYS, _build_user, _parse as _parse_judge,
+            )
+            crit = ctrl.paraphrases or [ctrl.label or ctrl.control_id]
+            user = _build_user(text, ctrl.label or ctrl.control_id, crit)
+            obj = None
+            for _ in range(2):
+                obj = _parse_judge(await _call_anthropic(_JUDGE_SYS, user, max_tokens=400))
+                if obj:
+                    break
+            verdict = obj.get("erfuellt") if obj else None
+            # Only an explicit JSON boolean counts as a verdict; a missing or
+            # null "erfuellt" must stay undecided instead of becoming False.
+            if not isinstance(verdict, bool):
+                return CheckResult(present=None, source="haiku")
+            return CheckResult(
+                present=verdict,
+                evidence=(obj.get("begruendung") or "")[:120],
+                confidence=float(obj.get("confidence") or 0.0),
+                source="haiku",
+            )
+        except Exception as e:
+            logger.info("llm haiku checker fail %s: %s", ctrl.control_id, str(e)[:80])
+            return CheckResult(present=None, source="error")
diff --git a/backend-compliance/compliance/services/checkers/router.py b/backend-compliance/compliance/services/checkers/router.py
new file mode 100644
index 00000000..a9d7cb04
--- /dev/null
+++ b/backend-compliance/compliance/services/checkers/router.py
@@ -0,0 +1,69 @@
+"""Checker router — method-agnostic dispatch.
+
+control → sensor_classification (verification_method + decision_method) → checker.
+A new module only supplies ControlSpecs; the router picks the checker. This makes
+the "embedding finds, Claude decides" path ONE shared CONTENT/LLM checker instead
+of cookie-specific logic. Checkers not built yet (PLAYWRIGHT/AUDIT/SCANNER/REGEX-
+FIELD) → present=None (fail-safe: the caller keeps its deterministic result).
+"""
+from __future__ import annotations
+
+from typing import Any, Optional
+
+from .base import CheckResult, ControlSpec, DecisionMethod, DocContext
+from .embedding_checker import EmbeddingChecker
+from .llm_checker import LLMChecker
+from .reference_checker import ReferenceChecker
+
+_LLM = LLMChecker()
+_EMB = EmbeddingChecker()
+_REF = ReferenceChecker()
+
+# decision_method → checker. Missing mechanisms intentionally resolve to None (not built yet).
+_BY_DECISION: dict[str, Any] = {
+    DecisionMethod.LLM: _LLM,
+    DecisionMethod.EMBEDDING: _EMB,
+    DecisionMethod.LINK_RESOLVER: _REF,
+}
+
+
+async def route_and_check(ctrl: ControlSpec, doc: DocContext) -> CheckResult:
+    """Run the checker registered for ctrl.decision_method; present=None if none."""
+    checker = _BY_DECISION.get((ctrl.decision_method or "").upper())
+    if checker is None:
+        return CheckResult(present=None,
+                           source=f"no_checker:{ctrl.decision_method}")
+    return await checker.check(ctrl, doc)
+
+
+def build_spec(
+    control_id: str,
+    sensor_classification: Optional[dict[str, Any]],
+    *,
+    label: str = "",
+    criteria: Optional[list] = None,
+    question: str = "",
+    patterns: Optional[list[str]] = None,
+    embed_threshold: Optional[float] = None,
+) -> ControlSpec:
+    """Build a ControlSpec from the STORED sensor_classification
+    (canonical_controls.generation_metadata.sensor_classification) plus the
+    control criteria. CONTENT/LLM → judge='haiku' (validated sufficiency
+    judge; default for sufficiency per the decision of 2026-06-22)."""
+    sc = sensor_classification or {}
+    vm = (sc.get("verification_method") or "").upper()
+    dm = (sc.get("decision_method") or "").upper()
+    extra: dict[str, Any] = {}
+    if vm == "CONTENT" and dm == "LLM":
+        extra["judge"] = "haiku"
+    return ControlSpec(
+        control_id=control_id,
+        verification_method=vm,
+        decision_method=dm,
+        label=label,
+        paraphrases=[str(c) for c in (criteria or []) if c],
+        question=question,
+        patterns=patterns or [],
+        embed_threshold=embed_threshold,
+        extra=extra,
+    )
diff --git a/backend-compliance/compliance/services/specialist_agents/cookie_policy/_sufficiency_judge.py b/backend-compliance/compliance/services/specialist_agents/cookie_policy/_sufficiency_judge.py
index 3a26d566..df22b933 100644
--- a/backend-compliance/compliance/services/specialist_agents/cookie_policy/_sufficiency_judge.py
+++ b/backend-compliance/compliance/services/specialist_agents/cookie_policy/_sufficiency_judge.py
@@ -32,32 +32,30 @@ async def judge_rescued(text: str, results: list[dict[str, Any]]) -> int:
     Nimmt passed zurueck, wenn der Judge die Pflicht als NICHT erfuellt sieht.
     Gibt die Anzahl zurueckgenommener (korrigierter) Rescues zurueck.
     """
-    from compliance.services.llm_cascade import _call_anthropic
-    from compliance.services.specialist_agents.dse.deep_check import (
-        _JUDGE_SYS, _build_user, _parse,
-    )
+    # Goes through the shared checker router (no cookie special case anymore):
+    # CONTENT/LLM → build_spec sets judge='haiku' → LLMChecker (validated
+    # sufficiency judge). This makes cookie the first real router consumer.
+    from compliance.services.checkers.base import DocContext
+    from compliance.services.checkers.router import build_spec, route_and_check

     candidates = [r for r in results if _is_rescued(r)]
     if not candidates:
         return 0
+    doc = DocContext(text=text)
+    sc = {"verification_method": "CONTENT", "decision_method": "LLM"}
     corrected = 0
     for r in candidates:
         crit = r.get("_pass_criteria") or [r.get("label") or r.get("hint") or ""]
         if not isinstance(crit, list):
             crit = [str(crit)]
-        title = r.get("label") or r.get("hint") or r.get("control_id") or ""
-        user = _build_user(text, title, crit)
-        verdict = None
-        for _ in range(2):  # retry on transient/malformed
-            p = _parse(await _call_anthropic(_JUDGE_SYS, user, max_tokens=400))
-            if p:
-                verdict = p
-                break
-        if verdict is not None and verdict.get("erfuellt") is False:
+        label = r.get("label") or r.get("hint") or r.get("control_id") or ""
+        spec = build_spec(r.get("control_id") or "", sc, label=label, criteria=crit)
+        res = await route_and_check(spec, doc)
+        if res.present is False:
             r["passed"] = False
             r["source"] = (r.get("source") or "") + "+llm_failed"
             r["matched_text"] = "[layer-3 sufficiency-judge: nicht erfuellt]"
-            r["_judge_reason"] = (verdict.get("begruendung") or "")[:200]
+            r["_judge_reason"] = (res.evidence or "")[:200]
             corrected += 1
     if corrected:
         logger.info("cookie layer-3 sufficiency-judge: %d/%d rescues zurueckgenommen",
diff --git a/backend-compliance/tests/test_checker_router.py b/backend-compliance/tests/test_checker_router.py
new file mode 100644
index 00000000..4d551508
--- /dev/null
+++ b/backend-compliance/tests/test_checker_router.py
@@ -0,0 +1,51 @@
+"""Checker router: build_spec from sensor_classification + method-agnostic
+dispatch. CONTENT/LLM -> Haiku sufficiency tier (validated), unknown
+decision_methods -> fail-safe present=None."""
+import pytest
+from unittest.mock import AsyncMock, patch
+
+from compliance.services.checkers.base import DocContext
+from compliance.services.checkers.router import build_spec, route_and_check
+
+_ANTHROPIC = "compliance.services.llm_cascade._call_anthropic"
+
+
+def test_build_spec_content_llm_uses_haiku():
+    s = build_spec("X", {"verification_method": "CONTENT", "decision_method": "LLM"},
+                   label="L", criteria=["a", "b"])
+    assert s.verification_method == "CONTENT" and s.decision_method == "LLM"
+    assert s.extra.get("judge") == "haiku"
+    assert s.paraphrases == ["a", "b"]
+
+
+def test_build_spec_embedding_no_haiku():
+    s = build_spec("X", {"verification_method": "CONTENT", "decision_method": "EMBEDDING"})
+    assert s.extra.get("judge") is None
+
+
+@pytest.mark.asyncio
+async def test_route_unknown_decision_is_failsafe():
+    s = build_spec("X", {"verification_method": "BEHAVIOR", "decision_method": "PLAYWRIGHT"})
+    r = await route_and_check(s, DocContext(text="x" * 200))
+    assert r.present is None and "no_checker" in r.source
+
+
+@pytest.mark.asyncio
+async def test_route_content_llm_haiku_fehlt():
+    s = build_spec("X", {"verification_method": "CONTENT", "decision_method": "LLM"},
+                   label="Speicherdauer", criteria=["Höchstdauer pro Kategorie"])
+    fake = AsyncMock(return_value='{"erfuellt": false, "confidence": 0.9, "begruendung": "fehlt"}')
+    with patch(_ANTHROPIC, new=fake):
+        r = await route_and_check(s, DocContext(text="Wir nutzen Cookies. " * 30))
+    assert r.present is False and r.source == "haiku"
+    assert fake.call_count >= 1
+
+
+@pytest.mark.asyncio
+async def test_route_content_llm_haiku_erfuellt():
+    s = build_spec("X", {"verification_method": "CONTENT", "decision_method": "LLM"},
+                   label="L", criteria=["x"])
+    fake = AsyncMock(return_value='{"erfuellt": true, "confidence": 0.8}')
+    with patch(_ANTHROPIC, new=fake):
+        r = await route_and_check(s, DocContext(text="text " * 40))
+    assert r.present is True
diff --git a/backend-compliance/tests/test_cookie_sufficiency_judge.py b/backend-compliance/tests/test_cookie_sufficiency_judge.py
index 95a00877..d3d2c15c 100644
--- a/backend-compliance/tests/test_cookie_sufficiency_judge.py
+++ b/backend-compliance/tests/test_cookie_sufficiency_judge.py
@@ -8,6 +8,7 @@ from compliance.services.specialist_agents.cookie_policy._sufficiency_judge impo
 )

 _ANTHROPIC = "compliance.services.llm_cascade._call_anthropic"
+_DOC = "Volltext der Cookie-Richtlinie mit ausreichend Inhalt. " * 4


 def _r(cid, source, passed=True):
@@ -20,7 +21,7 @@ async def test_rescued_unpassed_when_judge_fehlt():
     results = [_r("A", "keyword+embedding")]
     fake = AsyncMock(return_value='{"erfuellt": false, "confidence": 0.9, "begruendung": "fehlt"}')
     with patch(_ANTHROPIC, new=fake):
-        n = await judge_rescued("text", results)
+        n = await judge_rescued(_DOC, results)
     assert n == 1
     assert results[0]["passed"] is False
     assert "+llm_failed" in results[0]["source"]
@@ -31,7 +32,7 @@ async def test_rescued_kept_when_judge_erfuellt():
     results = [_r("A", "keyword+embedding")]
     fake = AsyncMock(return_value='{"erfuellt": true, "confidence": 0.9}')
     with patch(_ANTHROPIC, new=fake):
-        n = await judge_rescued("text", results)
+        n = await judge_rescued(_DOC, results)
     assert n == 0
     assert results[0]["passed"] is True

@@ -42,7 +43,7 @@ async def test_keyword_pass_not_judged():
     results = [_r("A", "keyword")]
     fake = AsyncMock(return_value='{"erfuellt": false}')
     with patch(_ANTHROPIC, new=fake):
-        n = await judge_rescued("text", results)
+        n = await judge_rescued(_DOC, results)
     assert n == 0
     assert results[0]["passed"] is True
     assert fake.call_count == 0
@@ -53,7 +54,7 @@ async def test_boost_rescue_is_judged():
     results = [_r("A", "keyword+regex_boost")]
     fake = AsyncMock(return_value='{"erfuellt": false}')
     with patch(_ANTHROPIC, new=fake):
-        n = await judge_rescued("text", results)
+        n = await judge_rescued(_DOC, results)
     assert n == 1 and results[0]["passed"] is False


@@ -63,5 +64,5 @@ async def test_failed_controls_ignored():
     results = [_r("A", "keyword+embedding", passed=False)]
     fake = AsyncMock(return_value='{"erfuellt": false}')
     with patch(_ANTHROPIC, new=fake):
-        n = await judge_rescued("text", results)
+        n = await judge_rescued(_DOC, results)
     assert n == 0 and fake.call_count == 0