"""LLM Plausibility Re-Evaluation for MC findings.

Why this exists:
    MC-DB labels are historic compliance-officer questions
    ("Dokumentiert die DSI alle Datenübermittlungen gemäß Art. 49 Abs. 1
    Unterabs. 2 DS-GVO?"). When the deterministic regex+LLM-verify pipeline
    flags them as FAIL, the question stays as the title. The reader sees
    "we don't know" — unhelpful.

What this does:
    AFTER the MC pipeline finished, run a second LLM pass over EVERY
    remaining FAIL with the original doc-text. The LLM:

    1. Reformulates the question as a STATEMENT-OF-TOPIC
       ("Drittland-Übermittlungen nach Art. 49 Abs. 1 Unterabs. 2 DS-GVO")
    2. Suggests a plausible severity (or DROP if the finding is bogus)
    3. Produces a CONCRETE recommendation
       ("Im Abschnitt 'Drittland' der DSE Mechanismus pro Empfänger
       ergänzen")

What this does NOT do:
    - Touch the MC-DB. Original label stays in c.label.
    - Touch passed/skipped/regulation/matched_text — those are facts.
    - Run for non-fails or already-handled checks.

Stamping schema on each Check (CheckItem dataclass):
    llm_title: str          — reformulated topic statement
    llm_severity: str       — suggested severity
                              ("HIGH"|"MEDIUM"|"LOW"|"DROP", or "" when the
                              LLM answered with anything else)
    llm_recommendation: str — concrete fix recommendation
    llm_drop: bool          — True if the LLM judged the finding not plausible
    llm_plausibility: float — 0..1 confidence (optional; NOT stamped by this
                              module at the moment)

The mail-render V2 reads these stamps and renders them next to the original
label (🤖 LLM-Plausibility box).

Config (environment variables):
    OLLAMA_URL               default "http://host.docker.internal:11434"
    PLAUSIBILITY_LLM_MODEL   default "qwen3:30b-a3b"
    PLAUSIBILITY_BATCH_SIZE  default 4 (values < 1 are clamped to 1)
    PLAUSIBILITY_TIMEOUT_S   default 45.0
    PLAUSIBILITY_DOC_EXCERPT default 1500 (characters of doc text per prompt)
"""

from __future__ import annotations

import hashlib
import json
import logging
import os

import httpx

logger = logging.getLogger(__name__)

OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
MODEL = os.getenv("PLAUSIBILITY_LLM_MODEL", "qwen3:30b-a3b")
# Reduced from 8 → 4 to fight qwen3 empty-response-on-large-prompts bug.
# 4 items × ~500 token/item + 2000 system + 1500 excerpt = ~5500 token total,
# well within qwen3's safe range for format='json'.
# Clamped to >= 1: a step of 0 would make range() raise and a negative step
# would silently skip every batch.
BATCH_SIZE = max(1, int(os.getenv("PLAUSIBILITY_BATCH_SIZE", "4")))
TIMEOUT = float(os.getenv("PLAUSIBILITY_TIMEOUT_S", "45.0"))
# Reduced excerpt 4000 → 1500 chars (same reason).
DOC_EXCERPT_CHARS = int(os.getenv("PLAUSIBILITY_DOC_EXCERPT", "1500"))

# In-memory cache: (input_hash) -> result_dict. Survives one run.
_CACHE: dict[str, dict] = {}

# Severities accepted for stamping; anything else is stamped as "".
_VALID_SEVERITIES = frozenset({"HIGH", "MEDIUM", "LOW", "DROP"})

# String spellings of a boolean "true" that an LLM may emit instead of a
# real JSON boolean.
_TRUTHY_STRINGS = frozenset({"true", "1", "yes", "ja"})


def _checksum(check_id: str, label: str, hint: str, doc_excerpt: str) -> str:
    """Return a stable 16-hex-char hash of the LLM input.

    Used as the ``_CACHE`` key so identical inputs are not re-asked on
    retries. Only the part of ``doc_excerpt`` that is actually sent to the
    LLM (the first ``DOC_EXCERPT_CHARS`` characters) is hashed, so the key
    changes exactly when the prompt's excerpt changes.
    """
    h = hashlib.sha256()
    for part in (check_id, label, hint):
        h.update(part.encode())
        # NUL separator keeps ("ab", "c") and ("a", "bc") distinct.
        h.update(b"\x00")
    h.update(doc_excerpt[:DOC_EXCERPT_CHARS].encode())
    return h.hexdigest()[:16]


_SYSTEM_PROMPT = (
    "Du bist Compliance-Plausibilitäts-Auditor für deutsche "
    "Datenschutz-Prüfberichte. Für jeden Finding-Eintrag bekommst du "
    "die MC-Pflichtfrage, den LLM-Hinweis und einen Ausschnitt aus "
    "dem geprüften Dokument.\n\n"
    "REGELN — sehr wichtig:\n"
    "1. Du gibst für JEDEN Finding-Eintrag im Input GENAU EINEN Output-"
    "Eintrag zurück (keine ausgelassen, keine zusätzlichen).\n"
    "2. Die ID muss BUCHSTABENGENAU vom Input übernommen werden — "
    "nicht abgekürzt, nicht umformatiert (Beispiel: \"mc-DATA-3953-A04\" "
    "bleibt \"mc-DATA-3953-A04\").\n"
    "3. Reihenfolge der Output-Items entspricht der Input-Reihenfolge.\n\n"
    "Pro Finding:\n"
    "- title: TOPIC-STATEMENT (max 80 Zeichen, ohne Frageton, "
    "nennt die Norm wenn sinnvoll). Beispiel: "
    "Frage \"Dokumentiert die DSI Drittlandtransfers nach Art. 49?\" "
    "→ title \"Drittlandtransfer-Doku Art. 49 DSGVO\".\n"
    "- severity: HIGH (klar verletzt), MEDIUM (verletzt, weniger "
    "kritisch), LOW (unsicher / manuelle Prüfung), DROP "
    "(Auszug zeigt klar dass die Anforderung erfüllt ist).\n"
    "- recommendation: KONKRETE Aktion (max 200 Zeichen), nennt "
    "WAS und WO. Beispiel: \"Im Abschnitt 'Drittlandtransfer' "
    "der DSE pro Empfänger einen Mechanismus nach Art. 49 ergänzen\".\n"
    "- drop: true wenn severity=DROP, sonst false.\n\n"
    "JSON-Schema (genauso antworten):\n"
    "{\"findings\":["
    "{\"id\":\"\",\"title\":\"...\","
    "\"severity\":\"HIGH|MEDIUM|LOW|DROP\","
    "\"recommendation\":\"...\",\"drop\":false}"
    "]}\n\n"
    "Beispiel-Antwort bei 2 Inputs mit IDs mc-A und mc-B:\n"
    "{\"findings\":[{\"id\":\"mc-A\",\"title\":\"Norm X erfüllen\","
    "\"severity\":\"MEDIUM\",\"recommendation\":\"In Abschnitt Y "
    "ergänzen: Norm X erfüllt\",\"drop\":false},"
    "{\"id\":\"mc-B\",\"title\":\"Norm Z geprüft\",\"severity\":\"DROP\","
    "\"recommendation\":\"Bereits erfüllt — Hinweis im Doc Z3\","
    "\"drop\":true}]}"
)


def _build_user_prompt(items: list[dict], doc_title: str,
                       doc_excerpt: str) -> str:
    """Render the user message for one batch.

    Args:
        items: dicts with ``id`` and ``label`` (required) plus optional
            ``hint`` and ``severity``.
        doc_title: human-readable name of the checked document.
        doc_excerpt: document text; truncated to ``DOC_EXCERPT_CHARS``.

    Returns:
        The prompt text: document header, excerpt, and a numbered list of
        findings (hint truncated to 200 characters each).
    """
    findings_block = "\n".join(
        f'{pos}. ID="{it["id"]}" | FRAGE: {it["label"]} | '
        f'HINT: {(it.get("hint") or "")[:200]} | '
        f'SEV_REGEX: {it.get("severity")}'
        for pos, it in enumerate(items, start=1)
    )
    return (
        f"DOKUMENT: {doc_title}\n\n"
        f"DOKUMENT-AUSZUG (max {DOC_EXCERPT_CHARS} Zeichen):\n"
        f"{doc_excerpt[:DOC_EXCERPT_CHARS]}\n\n"
        f"FINDINGS ZU BEWERTEN:\n{findings_block}"
    )


def _as_text(value: object) -> str:
    """Coerce an LLM-supplied JSON value to ``str``; falsy values → ``""``."""
    if isinstance(value, str):
        return value
    if not value:
        return ""
    return str(value)


def _as_bool(value: object) -> bool:
    """Interpret an LLM-supplied flag.

    Strings are compared against ``_TRUTHY_STRINGS`` because plain
    ``bool("false")`` would be ``True``.
    """
    if isinstance(value, str):
        return value.strip().lower() in _TRUTHY_STRINGS
    return bool(value)


def _entry_to_stamp(entry: dict) -> dict:
    """Convert one LLM finding entry into the ``llm_*`` stamp dict.

    Title is capped at 200 and recommendation at 400 characters; severity
    is whitespace-stripped and upper-cased (validation against the allowed
    set happens when stamping).
    """
    return {
        "llm_title": _as_text(entry.get("title"))[:200],
        "llm_severity": _as_text(entry.get("severity")).strip().upper(),
        "llm_recommendation": _as_text(entry.get("recommendation"))[:400],
        "llm_drop": _as_bool(entry.get("drop", False)),
    }


def _entry_id(entry: object) -> str:
    """Return the stripped string ``id`` of an LLM entry, or ``""``."""
    if not isinstance(entry, dict):
        return ""
    raw = entry.get("id")
    return raw.strip() if isinstance(raw, str) else ""


def _map_findings(input_ids: list[str],
                  llm_findings: list) -> dict[str, dict]:
    """Assign LLM finding entries to input IDs.

    Three phases, each only filling inputs the previous ones left open:

    1. exact ID match,
    2. position fallback (entry at the same index as the input),
    3. fuzzy match on the last 8 characters of the input ID / substring.

    An entry is used for at most one input; entries that are not dicts are
    never used.

    Returns:
        Mapping ``input_id -> stamp dict`` (see ``_entry_to_stamp``).
    """
    out: dict[str, dict] = {}
    wanted = set(input_ids)
    entry_ids = [_entry_id(entry) for entry in llm_findings]
    # Indices that may not be (re)used; malformed entries start out claimed.
    claimed: set[int] = {
        idx for idx, entry in enumerate(llm_findings)
        if not isinstance(entry, dict)
    }

    # Phase 1: exact ID match. Duplicates of a matched ID are claimed too,
    # so they cannot be handed to a different input by position later.
    for idx, fid in enumerate(entry_ids):
        if idx in claimed or fid not in wanted:
            continue
        claimed.add(idx)
        if fid not in out:
            out[fid] = _entry_to_stamp(llm_findings[idx])

    # Phase 2: position fallback — for any input item still unmapped, use
    # the LLM finding at the same index if it's otherwise unclaimed.
    for idx, input_id in enumerate(input_ids):
        if input_id in out or idx >= len(llm_findings) or idx in claimed:
            continue
        out[input_id] = _entry_to_stamp(llm_findings[idx])
        claimed.add(idx)

    # Phase 3: fuzzy match by ID-tail, restricted to unclaimed entries.
    for idx, fid in enumerate(entry_ids):
        if len(out) >= len(wanted):
            break
        if idx in claimed or not fid:
            continue
        needle = fid.lower()
        for input_id in input_ids:
            if input_id in out:
                continue
            tail = input_id[-8:].lower()
            if (tail and tail in needle) or needle in input_id.lower():
                out[input_id] = _entry_to_stamp(llm_findings[idx])
                claimed.add(idx)
                break

    return out


async def _ask_llm_batch(items: list[dict], doc_title: str,
                         doc_excerpt: str) -> dict[str, dict]:
    """Send a batch of up to BATCH_SIZE findings to the LLM.

    Best-effort: any HTTP, JSON or mapping failure is logged and yields an
    empty (or partial) result instead of raising.

    Args:
        items: finding dicts as expected by ``_build_user_prompt``.
        doc_title: human-readable name of the checked document.
        doc_excerpt: document text used as context.

    Returns:
        Mapping ``input_id -> stamp dict`` for every finding that could be
        matched to an LLM answer.
    """
    out: dict[str, dict] = {}
    if not items:
        return out

    body = {
        "model": MODEL,
        "messages": [
            {"role": "system", "content": _SYSTEM_PROMPT},
            {"role": "user", "content": _build_user_prompt(
                items, doc_title, doc_excerpt,
            )},
        ],
        "format": "json",
        "stream": False,
        "options": {"temperature": 0.0, "seed": 42, "num_predict": 1500},
    }
    input_ids = [it["id"] for it in items]

    try:
        # The client is closed before any retry so split retries do not
        # stack open connections.
        async with httpx.AsyncClient(timeout=TIMEOUT) as client:
            resp = await client.post(f"{OLLAMA_URL}/api/chat", json=body)
            resp.raise_for_status()
            content = (resp.json().get("message") or {}).get("content", "")

        if not content:
            # Retry with smaller batches — qwen3 sometimes rejects
            # ≥6-item prompts under format='json'. Each half may split
            # again until a batch has at most 2 items.
            if len(items) > 2:
                half = len(items) // 2
                logger.info(
                    "plausibility empty → retry split %d → %dx2",
                    len(items), half,
                )
                first = await _ask_llm_batch(
                    items[:half], doc_title, doc_excerpt,
                )
                second = await _ask_llm_batch(
                    items[half:], doc_title, doc_excerpt,
                )
                out.update(first)
                out.update(second)
                return out
            logger.warning("plausibility LLM returned empty content")
            return out

        try:
            data = json.loads(content)
        except json.JSONDecodeError as je:
            logger.warning(
                "plausibility LLM JSON parse failed: %s; raw=%s",
                je, content[:300],
            )
            return out

        llm_findings = data.get("findings") if isinstance(data, dict) else None
        if not isinstance(llm_findings, list) or not llm_findings:
            logger.warning(
                "plausibility LLM returned 0 findings for %d input "
                "items; raw=%s", len(items), content[:300],
            )
            return out

        out = _map_findings(input_ids, llm_findings)

        if not out:
            logger.warning(
                "plausibility could not map any of %d input IDs; "
                "raw=%s", len(input_ids), content[:300],
            )
        else:
            logger.info(
                "plausibility mapped %d/%d findings",
                len(out), len(input_ids),
            )
    except Exception as e:
        # Deliberate boundary: this pass is an optional enrichment and must
        # never break the calling pipeline.
        logger.warning("plausibility batch failed: %s", e)
    return out


async def verify_plausibility(results, doc_texts: dict[str, str]) -> None:
    """Stamp llm_* fields onto every FAIL CheckItem in results.

    Only checks that are neither passed nor skipped and whose id starts
    with "mc-" (case-insensitive) are considered. Checks are grouped by
    doc_type and sent to the LLM in batches of ``BATCH_SIZE``; results are
    memoised in ``_CACHE`` so identical inputs are asked once.

    Args:
        results: list of DocCheckResult, each with .checks (list of
            CheckItem) and .doc_type
        doc_texts: doc_type -> source text excerpt for context; when a
            doc_type has no text, the "dse" entry is used as a fallback.

    Returns:
        None. The CheckItem objects are modified in place
        (llm_title, llm_severity, llm_recommendation, llm_drop).
    """
    if not results:
        return
    doc_texts = doc_texts or {}

    # Gather candidate fails per doc_type so the prompt can scope the
    # excerpt correctly.
    by_doc: dict[str, list] = {}
    by_doc_meta: dict[str, str] = {}
    for r in results:
        dt = getattr(r, "doc_type", "")
        label = getattr(r, "label", "") or dt
        for c in getattr(r, "checks", []) or []:
            if getattr(c, "passed", True) or getattr(c, "skipped", False):
                continue
            # MC checks only — skip the structural P-* placement findings
            cid = (getattr(c, "id", "") or "").lower()
            if not cid.startswith("mc-"):
                continue
            by_doc.setdefault(dt, []).append(c)
            by_doc_meta[dt] = label

    if not by_doc:
        return

    total = sum(len(v) for v in by_doc.values())
    logger.info("plausibility-check: %d findings across %d docs",
                total, len(by_doc))

    for dt, checks in by_doc.items():
        doc_title = by_doc_meta.get(dt) or dt
        # Fall back to DSE excerpt when the doc has no own text
        doc_text = doc_texts.get(dt) or doc_texts.get("dse") or ""

        for start in range(0, len(checks), BATCH_SIZE):
            batch = checks[start:start + BATCH_SIZE]
            items = [
                {
                    "id": getattr(c, "id", "") or "",
                    "label": getattr(c, "label", "") or "",
                    "hint": getattr(c, "hint", "") or "",
                    "severity": getattr(c, "severity", ""),
                }
                for c in batch
            ]
            # One cache key per item, computed once and reused below.
            keys = [
                _checksum(it["id"], it["label"], it["hint"], doc_text)
                for it in items
            ]

            # Only ask the LLM about items that are not cached yet.
            uncached_items = [
                it for it, key in zip(items, keys) if key not in _CACHE
            ]
            if uncached_items:
                fresh = await _ask_llm_batch(
                    uncached_items, doc_title, doc_text,
                )
                for it, key in zip(items, keys):
                    if key not in _CACHE and it["id"] in fresh:
                        _CACHE[key] = fresh[it["id"]]

            # Stamp onto each CheckItem
            stamped = 0
            for c, key in zip(batch, keys):
                res = _CACHE.get(key)
                if res is None:
                    continue
                try:
                    c.llm_title = res.get("llm_title", "") or ""
                    sev = res.get("llm_severity", "") or ""
                    c.llm_severity = sev if sev in _VALID_SEVERITIES else ""
                    c.llm_recommendation = res.get(
                        "llm_recommendation", "") or ""
                    c.llm_drop = bool(res.get("llm_drop", False))
                    stamped += 1
                except Exception as exc:
                    # Still best-effort (e.g. a CheckItem that rejects new
                    # attributes), but no longer silent.
                    logger.warning(
                        "plausibility could not stamp check %s: %s",
                        getattr(c, "id", ""), exc,
                    )
            logger.info("plausibility-check %s: batch %d → %d stamped",
                        dt, len(batch), stamped)