38a347a82a
CI / detect-changes (push) Successful in 7s
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / secret-scan (push) Has been skipped
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / build-sha-integrity (push) Successful in 9s
CI / validate-canonical-controls (push) Successful in 12s
CI / loc-budget (push) Successful in 24s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Successful in 3m11s
CI / test-go (push) Has been skipped
CI / iace-gt-coverage (push) Has been skipped
CI / test-python-backend (push) Successful in 24s
CI / test-python-document-crawler (push) Has been skipped
CI / test-python-dsms-gateway (push) Has been skipped
AGB v2 (decision_method routing, 71%FP->~0) + DSE v3 (4-layer, recovered from container) + Architektur-Tab into /sdk/agent live path. Incl CI robustness (detect-changes.sh + PR-head checkout) + security (hardcoded Qdrant key removed, gitleaks allowlist). Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
68 lines
2.6 KiB
Python
68 lines
2.6 KiB
Python
"""DSE Embedding-Recall — deterministische semantische Schicht (gecacht).
|
|
|
|
Testet die reine Logik OHNE Embedding-Service: Cache-Treffer-Pfad,
|
|
Schwellen-Filter, Kandidaten-Schnitt, Reachability-Guard. Das Einbetten selbst
|
|
(Embedding-Service) ist Integration und wird auf macmini/Prod validiert.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import asyncio
|
|
import json
|
|
|
|
import compliance.services.specialist_agents.dse._embedding_recall as er
|
|
|
|
|
|
# Sample document text shared by all tests; repeated so it is longer than 100 characters.
_TEXT = "Datenschutzerklaerung der Muster GmbH. " * 20
|
|
|
|
|
|
def _seed_cache(tmp_path, scores: dict[str, float]) -> str:
    """Write a JSON cache file for ``_TEXT`` and return its path as a string.

    The file ``dse_embed_cache.json`` is created inside ``tmp_path`` and holds
    a single entry that maps ``er._doc_hash(_TEXT)`` to ``scores``.
    """
    cache_file = tmp_path / "dse_embed_cache.json"
    payload = {er._doc_hash(_TEXT): scores}
    cache_file.write_text(json.dumps(payload))
    return str(cache_file)
|
|
|
|
|
|
def test_doc_hash_deterministic():
    """Hashing is reproducible: same text gives the same hash, different text differs."""
    first = er._doc_hash(_TEXT)
    second = er._doc_hash(_TEXT)
    assert first == second

    hash_a = er._doc_hash("a")
    hash_b = er._doc_hash("b")
    assert hash_a != hash_b
|
|
|
|
|
|
def test_cache_hit_threshold_filter(tmp_path, monkeypatch):
    """On a cache hit only candidates scoring at or above the threshold are returned.

    The cache is pre-seeded for ``_TEXT``, so this test is meant to run without
    an embedding service.
    """
    cached_scores = {"DATA-1": 0.71, "DATA-2": 0.60, "AUTH-3": 0.68, "SEC-4": 0.50}
    cache_file = _seed_cache(tmp_path, cached_scores)
    # Point both the environment variable and the module attribute at the
    # seeded file.
    monkeypatch.setenv("DSE_EMBED_CACHE", cache_file)
    monkeypatch.setattr(er, "_CACHE_PATH", cache_file)

    candidates = ["DATA-1", "DATA-2", "AUTH-3", "SEC-4"]
    recalled = asyncio.run(er.embedding_recall(_TEXT, candidates, threshold=0.65))

    # Threshold 0.65 keeps DATA-1 (0.71) and AUTH-3 (0.68); DATA-2 (0.60) and
    # SEC-4 (0.50) fall below it.
    expected = {"DATA-1", "AUTH-3"}
    assert recalled == expected
|
|
|
|
|
|
def test_cache_hit_candidate_intersection(tmp_path, monkeypatch):
    """Cached controls that are not among the candidates are ignored."""
    cached_scores = {"DATA-1": 0.90, "DATA-2": 0.90}
    cache_file = tmp_path / "c.json"
    cache_file.write_text(json.dumps({er._doc_hash(_TEXT): cached_scores}))
    monkeypatch.setattr(er, "_CACHE_PATH", str(cache_file))

    recalled = asyncio.run(er.embedding_recall(_TEXT, ["DATA-1"], threshold=0.65))

    # DATA-2 also scores 0.90 but is not a candidate, so it must not appear.
    assert recalled == {"DATA-1"}
|
|
|
|
|
|
def test_empty_inputs():
    """A too-short text or an empty candidate list both produce an empty result."""
    too_short = asyncio.run(er.embedding_recall("zu kurz", ["X"]))
    assert too_short == set()

    no_candidates = asyncio.run(er.embedding_recall(_TEXT, []))
    assert no_candidates == set()
|
|
|
|
|
|
def test_service_down_returns_empty(tmp_path, monkeypatch):
    """No cache plus an unreachable service must give an empty set, without hanging.

    The cache path points at a file that this test never creates, and the
    reachability probe is replaced by a stub that always reports ``False``.
    The call is bounded by a timeout so that a hang fails the test instead of
    blocking the whole test run.
    """
    # No file is written at this path, so there is no cache entry to hit.
    monkeypatch.setattr(er, "_CACHE_PATH", str(tmp_path / "none.json"))

    async def _unreachable(timeout=2.0):
        # Stand-in for the reachability probe: the service is always "down".
        return False

    monkeypatch.setattr(er, "_embedding_reachable", _unreachable)

    async def _bounded_recall():
        # Generous upper bound; with the probe stubbed this should return
        # almost immediately. A timeout raises and fails the test.
        return await asyncio.wait_for(
            er.embedding_recall(_TEXT, ["DATA-1"]), timeout=10.0
        )

    out = asyncio.run(_bounded_recall())
    assert out == set()
|