"""CONTENT-Pruefer / decision_method=EMBEDDING. Ist die Pflicht SEMANTISCH im Text vorhanden? Max-Cosinus (Doc-Chunks x Control- Paraphrasen) >= per-Control-Schwelle. Deterministisch (festes Embedding-Modell) und gecacht. Rettet Recall-FP (Klausel da, anders formuliert). Faellt der Embedding-Service aus, liefert der Checker present=None (unklar) — der Aufrufer behaelt dann das Keyword-Ergebnis (kein Hang, kein Crash). (Validiert an AGB: 17 Items, per-Item-Schwelle, 0 Fehl-Rescue.) """ from __future__ import annotations import asyncio import logging from .base import CheckResult, ControlSpec, DocContext, VerificationMethod logger = logging.getLogger(__name__) # Paraphrasen-Vektoren je Control einmal einbetten + cachen. _PARA_CACHE: dict[str, list] = {} class EmbeddingChecker: verification_method = VerificationMethod.CONTENT async def check(self, ctrl: ControlSpec, doc: DocContext) -> CheckResult: text = doc.text or "" paras = ctrl.paraphrases or [] thr = ctrl.embed_threshold if ctrl.embed_threshold is not None else 0.60 if not paras or len(text) < 100: return CheckResult(present=None, source="embedding") try: from compliance.services.mc_embedding_matcher import ( DIM, _chunk_text, _cosine, _embed_texts, ) if ctrl.control_id not in _PARA_CACHE: pv = await _embed_texts(paras) _PARA_CACHE[ctrl.control_id] = [v for v in pv if v and len(v) == DIM] pvecs = _PARA_CACHE[ctrl.control_id] chunks = _chunk_text(text) cvecs = [v for v in await asyncio.wait_for( _embed_texts(chunks), timeout=90.0) if v and len(v) == DIM] except (Exception, asyncio.TimeoutError) as e: logger.info("embedding checker inaktiv %s: %s", ctrl.control_id, str(e)[:80]) return CheckResult(present=None, source="embedding") if not pvecs or not cvecs: return CheckResult(present=None, source="embedding") best = max((_cosine(p, c) for p in pvecs for c in cvecs), default=0.0) return CheckResult(present=best >= thr, confidence=round(best, 3), source="embedding")