"""LLM Plausibility Re-Evaluation for MC findings. Why this exists: MC-DB labels are historic compliance-officer questions ("Dokumentiert die DSI alle Datenübermittlungen gemäß Art. 49 Abs. 1 Unterabs. 2 DS-GVO?"). When the deterministic regex+LLM-verify pipeline flags them as FAIL, the question stays as the title. The reader sees "we don't know" — unhelpful. What this does: AFTER the MC pipeline finished, run a second LLM pass over EVERY remaining FAIL with the original doc-text. The LLM: 1. Reformulates the question as a STATEMENT-OF-TOPIC ("Drittland-Übermittlungen nach Art. 49 Abs. 1 Unterabs. 2 DS-GVO") 2. Suggests a plausible severity (or DROP if the finding is bogus) 3. Produces a CONCRETE recommendation ("Im Abschnitt 'Drittland' der DSE Mechanismus pro Empfänger ergänzen") What this does NOT do: - Touch the MC-DB. Original label stays in c.label. - Touch passed/skipped/regulation/matched_text — those are facts. - Run for non-fails or already-handled checks. Stamping schema on each Check (CheckItem dataclass): llm_title: str — reformulated topic statement llm_severity: str — suggested severity ("HIGH"|"MED"|"LOW"|"DROP") llm_recommendation: str — concrete fix recommendation llm_drop: bool — True if the LLM judged the finding not plausible llm_plausibility: float — 0..1 confidence (optional) The mail-render V2 reads these stamps and renders them next to the original label (🤖 LLM-Plausibility box). Config: OLLAMA_URL default "http://host.docker.internal:11434" PLAUSIBILITY_LLM_MODEL default "qwen3:30b-a3b" PLAUSIBILITY_BATCH_SIZE default 8 PLAUSIBILITY_TIMEOUT_S default 60.0 """ from __future__ import annotations import hashlib import json import logging import os import httpx logger = logging.getLogger(__name__) OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434") MODEL = os.getenv("PLAUSIBILITY_LLM_MODEL", "qwen3:30b-a3b") # Reduced from 8 → 4 to fight qwen3 empty-response-on-large-prompts bug. 
# 4 items × ~500 token/item + 2000 system + 1500 excerpt = ~5500 token total,
# well within qwen3's safe range for format='json'.
# Clamped to >= 1: a step of 0 makes range() raise in verify_plausibility and
# a negative step would silently process nothing.
BATCH_SIZE = max(1, int(os.getenv("PLAUSIBILITY_BATCH_SIZE", "4")))
TIMEOUT = float(os.getenv("PLAUSIBILITY_TIMEOUT_S", "45.0"))
# Reduced excerpt 4000 → 1500 chars (same reason).
DOC_EXCERPT_CHARS = int(os.getenv("PLAUSIBILITY_DOC_EXCERPT", "1500"))

# In-memory cache: input checksum -> stamp dict. Module-level, so it lives as
# long as the process; nothing in this module evicts entries.
_CACHE: dict[str, dict] = {}

# Severities accepted when stamping; anything else is stamped as "".
_VALID_SEVERITIES = frozenset({"HIGH", "MEDIUM", "LOW", "DROP"})

# String spellings of a true "drop" flag (LLMs sometimes quote booleans).
_TRUE_STRINGS = frozenset({"true", "1", "yes", "ja"})


def _checksum(check_id: str, label: str, hint: str, doc_excerpt: str) -> str:
    """Return a stable 16-hex-char hash of one finding's LLM input.

    Used as the cache key so identical inputs are not sent to the LLM twice.
    Only the first DOC_EXCERPT_CHARS characters of ``doc_excerpt`` are hashed
    because that is exactly the slice the prompt builder sends; text beyond
    it cannot influence the answer.
    """
    parts = (check_id, label, hint, doc_excerpt[:DOC_EXCERPT_CHARS])
    # NUL separators keep ("ab", "c") and ("a", "bc") from colliding.
    payload = b"\x00".join(part.encode() for part in parts)
    return hashlib.sha256(payload).hexdigest()[:16]


_SYSTEM_PROMPT = (
    "Du bist Compliance-Plausibilitäts-Auditor für deutsche "
    "Datenschutz-Prüfberichte. Für jeden Finding-Eintrag bekommst du "
    "die MC-Pflichtfrage, den LLM-Hinweis und einen Ausschnitt aus "
    "dem geprüften Dokument.\n\n"
    "REGELN — sehr wichtig:\n"
    "1. Du gibst für JEDEN Finding-Eintrag im Input GENAU EINEN Output-"
    "Eintrag zurück (keine ausgelassen, keine zusätzlichen).\n"
    "2. Die ID muss BUCHSTABENGENAU vom Input übernommen werden — "
    "nicht abgekürzt, nicht umformatiert (Beispiel: \"mc-DATA-3953-A04\" "
    "bleibt \"mc-DATA-3953-A04\").\n"
    "3. Reihenfolge der Output-Items entspricht der Input-Reihenfolge.\n\n"
    "Pro Finding:\n"
    "- title: TOPIC-STATEMENT (max 80 Zeichen, ohne Frageton, "
    "nennt die Norm wenn sinnvoll). Beispiel: "
    "Frage \"Dokumentiert die DSI Drittlandtransfers nach Art. 49?\" "
    "→ title \"Drittlandtransfer-Doku Art. 49 DSGVO\".\n"
    "- severity: HIGH (klar verletzt), MEDIUM (verletzt, weniger "
    "kritisch), LOW (unsicher / manuelle Prüfung), DROP "
    "(Auszug zeigt klar dass die Anforderung erfüllt ist).\n"
    "- recommendation: KONKRETE Aktion (max 200 Zeichen), nennt "
    "WAS und WO. Beispiel: \"Im Abschnitt 'Drittlandtransfer' "
    "der DSE pro Empfänger einen Mechanismus nach Art. 49 ergänzen\".\n"
    "- drop: true wenn severity=DROP, sonst false.\n\n"
    "JSON-Schema (genauso antworten):\n"
    "{\"findings\":["
    "{\"id\":\"\",\"title\":\"...\","
    "\"severity\":\"HIGH|MEDIUM|LOW|DROP\","
    "\"recommendation\":\"...\",\"drop\":false}"
    "]}\n\n"
    "Beispiel-Antwort bei 2 Inputs mit IDs mc-A und mc-B:\n"
    "{\"findings\":[{\"id\":\"mc-A\",\"title\":\"Norm X erfüllen\","
    "\"severity\":\"MEDIUM\",\"recommendation\":\"In Abschnitt Y "
    "ergänzen: Norm X erfüllt\",\"drop\":false},"
    "{\"id\":\"mc-B\",\"title\":\"Norm Z geprüft\",\"severity\":\"DROP\","
    "\"recommendation\":\"Bereits erfüllt — Hinweis im Doc Z3\","
    "\"drop\":true}]}"
)


def _build_user_prompt(items: list[dict], doc_title: str,
                       doc_excerpt: str) -> str:
    """Render the user message: document title, excerpt, numbered findings.

    Each item needs the keys ``id`` and ``label``; ``hint`` and ``severity``
    are optional. The hint is cut to 200 characters and the excerpt to
    DOC_EXCERPT_CHARS to keep the prompt small.
    """
    findings_block = "\n".join(
        f'{number}. ID="{item["id"]}" | FRAGE: {item["label"]} | '
        # `or ""` so a hint of None does not break the slice.
        f'HINT: {(item.get("hint") or "")[:200]} | '
        f'SEV_REGEX: {item.get("severity")}'
        for number, item in enumerate(items, start=1)
    )
    return (
        f"DOKUMENT: {doc_title}\n\n"
        f"DOKUMENT-AUSZUG (max {DOC_EXCERPT_CHARS} Zeichen):\n"
        f"{doc_excerpt[:DOC_EXCERPT_CHARS]}\n\n"
        f"FINDINGS ZU BEWERTEN:\n{findings_block}"
    )


async def _post_llm(body: dict) -> str:
    """POST one chat request to Ollama and return the message content.

    Returns "" on any failure (network error, HTTP error status, unparsable
    body, missing content) so the caller can pick a fallback strategy
    instead of handling exceptions.
    """
    try:
        async with httpx.AsyncClient(timeout=TIMEOUT) as client:
            response = await client.post(f"{OLLAMA_URL}/api/chat", json=body)
            response.raise_for_status()
            message = response.json().get("message") or {}
            return message.get("content", "") or ""
    except Exception as e:
        # Deliberately broad: this is the I/O boundary and every failure
        # mode maps to the same "empty response" signal.
        logger.warning("plausibility LLM call failed: %s", e)
        return ""


def _try_extract_json(content: str) -> "dict | None":
    """Extract a JSON object from free-form LLM output.

    Handles markdown-fenced and prose-wrapped responses. Returns None when
    the content is empty, is not valid JSON, or parses to something other
    than an object (callers rely on ``.get``).
    """
    if not content:
        return None
    text = content.strip()
    # Strip ```json … ``` fences.
    if text.startswith("```"):
        text = text.strip("`")
        if text.lower().startswith("json"):
            text = text[4:]
        text = text.strip()
    # Heuristic: cut from the first { to the last } to drop wrapping prose.
    first = text.find("{")
    last = text.rfind("}")
    if first >= 0 and last > first:
        text = text[first:last + 1]
    try:
        data = json.loads(text)
    except (ValueError, RecursionError):
        return None
    return data if isinstance(data, dict) else None


def _as_text(value) -> str:
    """Coerce an LLM-supplied field to str; falsy values become ""."""
    if not value:
        return ""
    return value if isinstance(value, str) else str(value)


def _as_bool(value) -> bool:
    """Interpret an LLM-supplied flag; the string "false" must not be True."""
    if isinstance(value, str):
        return value.strip().lower() in _TRUE_STRINGS
    return bool(value)


def _entry_to_stamp(entry: dict) -> dict:
    """Convert one LLM finding into the llm_* stamp dict.

    Title and recommendation are capped at 200 / 400 characters; severity is
    stripped and upper-cased but not validated here (validation happens when
    the stamp is applied to a check).
    """
    return {
        "llm_title": _as_text(entry.get("title"))[:200],
        "llm_severity": _as_text(entry.get("severity")).strip().upper(),
        "llm_recommendation": _as_text(entry.get("recommendation"))[:400],
        "llm_drop": _as_bool(entry.get("drop", False)),
    }


def _map_llm_findings(input_ids: list[str],
                      llm_findings: list) -> dict[str, dict]:
    """Assign LLM findings to input IDs; return ``{input_id: stamp}``.

    Three phases, each only filling IDs the previous one left open:
      1. exact ID match,
      2. same list position, if that answer is not already taken,
      3. fuzzy match on the ID tail / substring.
    One set of claimed answer indices is shared by all phases so an answer
    is never handed to two different inputs. Non-dict answers are ignored
    but keep their position so phase 2 stays aligned.
    """
    out: dict[str, dict] = {}
    claimed: set[int] = set()
    answer_ids: list[str] = []
    for idx, entry in enumerate(llm_findings):
        if isinstance(entry, dict):
            answer_ids.append(_as_text(entry.get("id")).strip())
        else:
            answer_ids.append("")
            claimed.add(idx)  # unusable, never assign it

    # Phase 1: exact ID match. A duplicate answer for the same ID is
    # claimed too, so it cannot be re-used positionally for another input.
    wanted = set(input_ids)
    for idx, fid in enumerate(answer_ids):
        if idx in claimed or fid not in wanted:
            continue
        claimed.add(idx)
        if fid not in out:
            out[fid] = _entry_to_stamp(llm_findings[idx])

    # Phase 2: position fallback.
    for idx, input_id in enumerate(input_ids):
        if input_id in out:
            continue
        if idx < len(llm_findings) and idx not in claimed:
            out[input_id] = _entry_to_stamp(llm_findings[idx])
            claimed.add(idx)

    # Phase 3: fuzzy match by ID tail, unclaimed answers only.
    for idx, fid in enumerate(answer_ids):
        if idx in claimed or not fid:
            continue
        fid_lower = fid.lower()
        for input_id in input_ids:
            if input_id in out:
                continue
            if (input_id[-8:].lower() in fid_lower
                    or fid_lower in input_id.lower()):
                out[input_id] = _entry_to_stamp(llm_findings[idx])
                claimed.add(idx)
                break
    return out


async def _ask_llm_batch(items: list[dict], doc_title: str,
                         doc_excerpt: str) -> dict[str, dict]:
    """Send a batch of up to BATCH_SIZE findings to the LLM.

    Resilience strategy (P125 fix for empty-response bug):
      A. format='json' (strict) — current default
      B. If A returns empty: no format constraint, extract JSON manually
      C. If B also empty AND batch >2: split batch + recurse
      D. Else: give up, return {}

    Returns:
        ``{input_id: stamp_dict}`` for every finding that could be mapped.
        Findings that could not be mapped are simply absent. Never raises;
        unexpected errors are logged and whatever was mapped is returned.
    """
    out: dict[str, dict] = {}
    if not items:
        return out

    base_body = {
        "model": MODEL,
        "messages": [
            {"role": "system", "content": _SYSTEM_PROMPT},
            {"role": "user",
             "content": _build_user_prompt(items, doc_title, doc_excerpt)},
        ],
        "stream": False,
        "options": {"temperature": 0.0, "seed": 42, "num_predict": 1500},
    }
    input_ids = [it["id"] for it in items]
    try:
        # Strategy A: format='json'
        content = await _post_llm({**base_body, "format": "json"})
        if not content:
            # Strategy B: format-free, parse-on-our-side
            logger.info(
                "plausibility A→empty, trying B (format-free) batch=%d",
                len(items),
            )
            content = await _post_llm(base_body)
        if not content:
            if len(items) > 2:
                # Strategy C: split + recurse
                half = len(items) // 2
                logger.info(
                    "plausibility A+B empty → split %d → %dx2",
                    len(items), half,
                )
                out.update(await _ask_llm_batch(
                    items[:half], doc_title, doc_excerpt,
                ))
                out.update(await _ask_llm_batch(
                    items[half:], doc_title, doc_excerpt,
                ))
                return out
            # Strategy D: give up
            logger.warning(
                "plausibility gave up after A+B for batch=%d", len(items),
            )
            return out

        data = _try_extract_json(content)
        if data is None:
            logger.warning(
                "plausibility LLM JSON parse failed (after fallback); "
                "raw=%s", content[:300],
            )
            return out

        llm_findings = data.get("findings") or []
        if isinstance(llm_findings, dict):
            # A single finding returned as a bare object instead of a list.
            llm_findings = [llm_findings]
        elif not isinstance(llm_findings, list):
            llm_findings = []
        if not llm_findings:
            logger.warning(
                "plausibility LLM returned 0 findings for %d input "
                "items; raw=%s", len(items), content[:300],
            )
            return out

        out = _map_llm_findings(input_ids, llm_findings)
        if not out:
            logger.warning(
                "plausibility could not map any of %d input IDs; "
                "raw=%s", len(input_ids), content[:300],
            )
        else:
            logger.info(
                "plausibility mapped %d/%d findings",
                len(out), len(input_ids),
            )
    except Exception as e:
        # Best-effort phase: a broken batch must not abort the report.
        logger.warning("plausibility batch failed: %s", e)
    return out


def _apply_stamp(check, stamp: dict) -> bool:
    """Write the llm_* fields of ``stamp`` onto ``check``.

    Returns True on success. A check that rejects attribute assignment is
    logged and reported as not stamped instead of being silently skipped.
    """
    severity = stamp.get("llm_severity", "") or ""
    try:
        check.llm_title = stamp.get("llm_title", "") or ""
        check.llm_severity = severity if severity in _VALID_SEVERITIES else ""
        check.llm_recommendation = stamp.get("llm_recommendation", "") or ""
        check.llm_drop = bool(stamp.get("llm_drop", False))
    except Exception as e:
        logger.warning(
            "plausibility could not stamp check %r: %s",
            getattr(check, "id", ""), e,
        )
        return False
    return True


async def verify_plausibility(results, doc_texts: dict[str, str]) -> None:
    """Stamp llm_* fields onto every failed MC CheckItem in results.

    Only checks that are neither passed nor skipped and whose ID starts with
    "mc-" (case-insensitive) are sent to the LLM. Checks are processed per
    doc_type in batches of BATCH_SIZE; answers are cached in _CACHE by input
    checksum. The function mutates the check objects and returns nothing.

    Args:
        results: list of DocCheckResult, each with .checks (list of
            CheckItem) and .doc_type
        doc_texts: doc_type -> source text excerpt for context; a doc_type
            without text falls back to the "dse" entry
    """
    if not results:
        return
    doc_texts = doc_texts or {}

    # Gather candidate fails per doc_type so the prompt can scope the
    # excerpt correctly.
    by_doc: dict[str, list] = {}
    by_doc_meta: dict[str, str] = {}
    for result in results:
        dt = getattr(result, "doc_type", "")
        label = getattr(result, "label", "") or dt
        for check in getattr(result, "checks", []) or []:
            if getattr(check, "passed", True) or getattr(check, "skipped",
                                                         False):
                continue
            # MC checks only — skip the structural P-* placement findings.
            cid = (getattr(check, "id", "") or "").lower()
            if not cid.startswith("mc-"):
                continue
            by_doc.setdefault(dt, []).append(check)
            by_doc_meta[dt] = label

    if not by_doc:
        return

    total = sum(len(v) for v in by_doc.values())
    logger.info("plausibility-check: %d findings across %d docs",
                total, len(by_doc))

    # Circuit breaker against a completely unresponsive Ollama: after N
    # consecutive batches with nothing stamped, abort the whole phase
    # instead of waiting out every remaining call.
    consecutive_empty_budget = int(
        os.getenv("PLAUSIBILITY_EMPTY_BUDGET", "6"),
    )
    consecutive_empty = 0

    for dt, checks in by_doc.items():
        doc_title = by_doc_meta.get(dt) or dt
        # Fall back to the DSE excerpt when the doc has no text of its own.
        doc_text = doc_texts.get(dt) or doc_texts.get("dse") or ""

        for start in range(0, len(checks), BATCH_SIZE):
            batch = checks[start:start + BATCH_SIZE]
            items = [
                {
                    "id": getattr(c, "id", "") or "",
                    "label": getattr(c, "label", "") or "",
                    "hint": getattr(c, "hint", "") or "",
                    "severity": getattr(c, "severity", ""),
                }
                for c in batch
            ]
            # One cache key per item, computed once and reused below.
            keys = [
                _checksum(it["id"], it["label"], it["hint"], doc_text)
                for it in items
            ]
            uncached_items = [
                it for it, key in zip(items, keys) if key not in _CACHE
            ]
            fresh: dict[str, dict] = {}
            if uncached_items:
                fresh = await _ask_llm_batch(
                    uncached_items, doc_title, doc_text,
                )

            # Resolve per cache key rather than per ID, so two checks that
            # share an ID but differ in label/hint cannot swap results.
            stamped = 0
            for check, item, key in zip(batch, items, keys):
                if key not in _CACHE and item["id"] in fresh:
                    _CACHE[key] = fresh[item["id"]]
                stamp = _CACHE.get(key)
                if stamp is not None and _apply_stamp(check, stamp):
                    stamped += 1

            # stamped == 0 counts towards the breaker only when an LLM call
            # was actually made; a batch served from the cache is neutral.
            tripped = False
            if uncached_items and stamped == 0:
                consecutive_empty += 1
                if consecutive_empty >= consecutive_empty_budget:
                    logger.warning(
                        "plausibility circuit-breaker tripped after "
                        "%d consecutive empty batches — aborting phase",
                        consecutive_empty,
                    )
                    tripped = True
            elif stamped > 0:
                consecutive_empty = 0

            logger.info("plausibility-check %s: batch %d → %d stamped",
                        dt, len(batch), stamped)
            if tripped:
                return