feat(dse): tiered 3-state evaluator + Layer-3 wiring (compliance_tier)
Getierte Auswertung mit compliance_tier-Gating (nur LEGAL_MINIMUM bestimmt ERFÜLLT/TEILWEISE/FEHLT; BEST_PRACTICE/OPTIONAL → Empfehlungen). Deterministisch-first: EMBEDDING-Präsenz + gecachter Haiku nur für Sufficiency → reproduzierbar (löst die gemessene Judge-Varianz). Layer-3 in v3_engine gated auf tiered_criteria, fail-safe (UNBESTIMMT → Legacy). Offene Kalibrierung: Präsenz-Schwelle (Schritt 2). Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,102 @@
|
||||
"""Unit-Tests für die getierte 3-Status-Auswertung (_tiered_eval).
|
||||
|
||||
Deckt ab: Status-Logik (inkl. kein-LM → ERFÜLLT, UNBESTIMMT bei nicht bewertbar),
|
||||
Empfehlungs-Sammlung, EMBEDDING/LLM-Routing (gemockt) und den Reproduzierbarkeits-
|
||||
Cache. Embedding/LLM werden gemockt — kein Netzwerk."""
|
||||
import asyncio
|
||||
|
||||
from compliance.services.specialist_agents.dse import _tiered_eval as te
|
||||
|
||||
|
||||
# ---- reine Status-Logik -------------------------------------------------
|
||||
def test_status_no_lm_is_erfuellt():
    """An empty result list must be reported as ``"ERFÜLLT"``.

    The empty list stands for "no results to judge" (per the module
    docstring: no LEGAL_MINIMUM criterion present).
    """
    assert te._status([]) == "ERFÜLLT"
|
||||
|
||||
|
||||
def test_status_all_met_erfuellt():
    """A result list containing only ``True`` must yield ``"ERFÜLLT"``."""
    assert te._status([True, True]) == "ERFÜLLT"
|
||||
|
||||
|
||||
def test_status_none_met_fehlt():
    """A result list containing only ``False`` must yield ``"FEHLT"``."""
    assert te._status([False, False]) == "FEHLT"
|
||||
|
||||
|
||||
def test_status_partial_teilweise():
    """A mix of ``True`` and ``False`` results must yield ``"TEILWEISE"``."""
    assert te._status([True, False]) == "TEILWEISE"
|
||||
|
||||
|
||||
def test_status_any_none_unbestimmt():
    """A ``None`` entry among the results must yield ``"UNBESTIMMT"``.

    ``None`` marks a result that could not be evaluated; here it sits next
    to a ``True`` entry and the expected status is still ``"UNBESTIMMT"``.
    """
    assert te._status([True, None]) == "UNBESTIMMT"
|
||||
|
||||
|
||||
# ---- evaluate_tiered (Embedding/LLM gemockt) ----------------------------
|
||||
def _crit(text: str, tier: str, dm: str = "EMBEDDING") -> dict:
    """Build a criterion dict as fed to ``evaluate_tiered`` in these tests.

    Args:
        text: Stored under the ``"criterion"`` key.
        tier: Stored under the ``"compliance_tier"`` key (the tests use
            ``"LEGAL_MINIMUM"``, ``"BEST_PRACTICE"`` and ``"OPTIONAL"``).
        dm: Stored under the ``"decision_method"`` key; defaults to
            ``"EMBEDDING"``.

    Returns:
        A new dict with the three values above plus a fixed placeholder
        ``"legal_basis"`` of ``"x"``.
    """
    return {
        "criterion": text,
        "compliance_tier": tier,
        "decision_method": dm,
        "legal_basis": "x",
    }
|
||||
|
||||
|
||||
class _Doc:
    """Minimal document stand-in that only carries a ``text`` attribute."""

    def __init__(self, text):
        # The tests pass this object as the last argument of evaluate_tiered.
        self.text = text
|
||||
|
||||
|
||||
def test_evaluate_partial_with_recommendation(monkeypatch):
    """One of two LEGAL_MINIMUM criteria present -> TEILWEISE plus one recommendation.

    ``te._embed_present`` is replaced by a stub that reports only
    "Zwecke genannt" as present, so no embedding service is contacted.
    The unmet BEST_PRACTICE criterion is expected to show up as the single
    recommendation rather than influence the status.
    """
    crits = [
        _crit("Zwecke genannt", "LEGAL_MINIMUM"),
        _crit("Speicherdauer genannt", "LEGAL_MINIMUM"),
        _crit("tabellarisch ausgewiesen", "BEST_PRACTICE"),
    ]

    async def fake_embed(texts, ctx, thr):
        # Fixed presence verdicts keyed by criterion text; arguments are ignored.
        return {
            "Zwecke genannt": True,
            "Speicherdauer genannt": False,
            "tabellarisch ausgewiesen": False,
        }

    monkeypatch.setattr(te, "_embed_present", fake_embed)
    out = asyncio.run(te.evaluate_tiered("C1", crits, {"hash": "h"}, _Doc("x" * 200)))

    assert out["status"] == "TEILWEISE"
    # Separate asserts so a failure names the field that is wrong.
    assert out["lm_met"] == 1
    assert out["lm_total"] == 2
    assert len(out["recommendations"]) == 1
    assert out["recommendations"][0]["tier"] == "BEST_PRACTICE"
|
||||
|
||||
|
||||
def test_evaluate_no_lm_is_erfuellt_with_recs(monkeypatch):
    """Only OPTIONAL criteria, none present -> ERFÜLLT with two recommendations.

    ``te._embed_present`` is stubbed to report every queried text as absent.
    With no LEGAL_MINIMUM criterion in the input, ``lm_total`` must be 0 and
    both OPTIONAL criteria are expected as recommendations.
    """
    crits = [_crit("Bildsymbole", "OPTIONAL"), _crit("Legende", "OPTIONAL")]

    async def fake_embed(texts, ctx, thr):
        # Every requested text is reported as not present.
        return {t: False for t in texts}

    monkeypatch.setattr(te, "_embed_present", fake_embed)
    out = asyncio.run(te.evaluate_tiered("C2", crits, {"hash": "h"}, _Doc("x" * 200)))

    assert out["status"] == "ERFÜLLT"
    assert out["lm_total"] == 0
    assert len(out["recommendations"]) == 2
|
||||
|
||||
|
||||
def test_evaluate_llm_criterion_routed(monkeypatch):
    """A single LEGAL_MINIMUM criterion with decision_method "LLM" -> ERFÜLLT.

    ``te._llm_met`` is replaced by a stub that always answers ``True``.
    NOTE(review): presumably evaluate_tiered sends "LLM" criteria through
    ``_llm_met``; ``_embed_present`` is deliberately left unpatched here.
    """
    crits = [_crit("Speicherdauer hinreichend nachvollziehbar", "LEGAL_MINIMUM", dm="LLM")]

    async def fake_llm(cid, idx, crit, doc, dh):
        # Criterion is always judged as met; arguments are ignored.
        return True

    monkeypatch.setattr(te, "_llm_met", fake_llm)
    out = asyncio.run(te.evaluate_tiered("C3", crits, {"hash": "h"}, _Doc("x" * 200)))

    # Separate asserts so a failure names the field that is wrong.
    assert out["status"] == "ERFÜLLT"
    assert out["lm_total"] == 1
|
||||
|
||||
|
||||
def test_evaluate_unbestimmt_when_embed_unavailable(monkeypatch):
    """A ``None`` presence verdict for a LEGAL_MINIMUM criterion -> UNBESTIMMT.

    ``te._embed_present`` is stubbed to return ``None`` for every text, which
    simulates the embedding service being unavailable.
    """
    crits = [_crit("Zwecke genannt", "LEGAL_MINIMUM")]

    async def fake_embed(texts, ctx, thr):
        return {t: None for t in texts}  # embedding service down

    monkeypatch.setattr(te, "_embed_present", fake_embed)
    out = asyncio.run(te.evaluate_tiered("C4", crits, {"hash": "h"}, _Doc("x" * 200)))

    assert out["status"] == "UNBESTIMMT"
|
||||
|
||||
|
||||
# ---- Reproduzierbarkeits-Cache -----------------------------------------
|
||||
def test_cache_roundtrip(monkeypatch, tmp_path):
    """Values written with ``_cache_put`` come back unchanged from ``_cache_get``.

    ``te._CACHE_DB`` is pointed at a file inside pytest's ``tmp_path`` so the
    test never touches a real cache. An unknown key must read as ``None``;
    stored ``True`` / ``False`` must come back as exactly those singletons
    (checked with ``is``), which keeps a cached ``False`` distinguishable
    from a cache miss.
    """
    monkeypatch.setattr(te, "_CACHE_DB", str(tmp_path / "cache.db"))

    assert te._cache_get("k1") is None

    te._cache_put("k1", True)
    te._cache_put("k2", False)

    assert te._cache_get("k1") is True
    assert te._cache_get("k2") is False
|
||||
Reference in New Issue
Block a user