breakpilot-compliance/backend-compliance/compliance/services/finding_plausibility_check.py

"""LLM Plausibility Re-Evaluation for MC findings.

Why this exists:
  MC-DB labels are historic compliance-officer questions ("Dokumentiert
  die DSI alle Datenübermittlungen gemäß Art. 49 Abs. 1 Unterabs. 2
  DS-GVO?"). When the deterministic regex+LLM-verify pipeline flags
  them as FAIL, the question stays as the title. The reader sees
  "we don't know" — unhelpful.

What this does:
  AFTER the MC pipeline finished, run a second LLM pass over EVERY
  remaining FAIL with the original doc-text. The LLM:
    1. Reformulates the question as a STATEMENT-OF-TOPIC
       ("Drittland-Übermittlungen nach Art. 49 Abs. 1 Unterabs. 2 DS-GVO")
    2. Suggests a plausible severity (or DROP if the finding is bogus)
    3. Produces a CONCRETE recommendation ("Im Abschnitt 'Drittland'
       der DSE Mechanismus pro Empfänger ergänzen")

What this does NOT do:
  - Touch the MC-DB. Original label stays in c.label.
  - Touch passed/skipped/regulation/matched_text — those are facts.
  - Run for non-fails or already-handled checks.

Stamping schema on each Check (CheckItem dataclass):
  llm_title: str          — reformulated topic statement
  llm_severity: str       — suggested severity ("HIGH"|"MEDIUM"|"LOW"|"DROP";
                            empty string if the LLM value is not one of these)
  llm_recommendation: str — concrete fix recommendation
  llm_drop: bool          — True if the LLM judged the finding not plausible
  llm_plausibility: float — 0..1 confidence (optional; not stamped by the
                            code in this module)

The mail-render V2 reads these stamps and renders them next to the
original label (🤖 LLM-Plausibility box).

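Config (environment variables):
  OLLAMA_URL                   default "http://host.docker.internal:11434"
  PLAUSIBILITY_LLM_MODEL       default "qwen2.5:7b"
  PLAUSIBILITY_FALLBACK_MODEL  default "llama3.2:3b"
  PLAUSIBILITY_BATCH_SIZE      default 3
  PLAUSIBILITY_TIMEOUT_S       default 45.0
  PLAUSIBILITY_DOC_EXCERPT     default 1500 (characters of doc text per prompt)
  PLAUSIBILITY_EMPTY_BUDGET    default 6 (consecutive empty batches before
                               the phase is aborted)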
"""

from __future__ import annotations

import hashlib
import json
import logging
import os

import httpx

logger = logging.getLogger(__name__)

# Base URL of the Ollama server; "/api/chat" is appended when a request is sent.
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
# Default model, switchable via environment variable. qwen3:30b-a3b has
# the best reasoning but tends to return empty responses for large DSEs
# under format='json'. qwen2.5:7b is 4× smaller, considerably more
# reliable, slightly weaker at reasoning but sufficient for the simple
# plausibility classification (PASS/MODIFY/DROP).
MODEL = os.getenv("PLAUSIBILITY_LLM_MODEL", "qwen2.5:7b")
# Fallback model tried when the primary model returned no content for
# both the strict (format='json') and the format-free request. The
# default is a small, robust model.
FALLBACK_MODEL = os.getenv("PLAUSIBILITY_FALLBACK_MODEL", "llama3.2:3b")
# A smaller model could cope with larger batches — but stay conservative
# so that a single model failure does not kill the whole phase.
BATCH_SIZE = int(os.getenv("PLAUSIBILITY_BATCH_SIZE", "3"))
# Timeout (seconds) passed to the HTTP client for each LLM request.
TIMEOUT = float(os.getenv("PLAUSIBILITY_TIMEOUT_S", "45.0"))
# Reduced excerpt 4000 → 1500 chars (same reason).
DOC_EXCERPT_CHARS = int(os.getenv("PLAUSIBILITY_DOC_EXCERPT", "1500"))

# In-memory cache: input checksum -> stamp dict. Module-level, so it is
# shared by all calls in this process; nothing in this module evicts it.
_CACHE: dict[str, dict] = {}


def _checksum(check_id: str, label: str, hint: str,
              doc_excerpt: str) -> str:
    """Return a short, stable cache key for one LLM input.

    The key is the first 16 hex digits of the SHA-256 over the check ID,
    the label, the hint and the first 2000 characters of the document
    excerpt, separated by NUL bytes so that field boundaries cannot blur.
    """
    fields = (check_id, label, hint, doc_excerpt[:2000])
    payload = b"\x00".join(field.encode() for field in fields)
    return hashlib.sha256(payload).hexdigest()[:16]


# System prompt (German, sent verbatim to the model). It sets the auditor
# role, requires exactly one output entry per input finding with the ID
# copied character-for-character and in input order, describes the
# title / severity / recommendation / drop fields, and shows the expected
# JSON shape {"findings": [...]} together with a two-item example answer.
_SYSTEM_PROMPT = (
    "Du bist Compliance-Plausibilitäts-Auditor für deutsche "
    "Datenschutz-Prüfberichte. Für jeden Finding-Eintrag bekommst du "
    "die MC-Pflichtfrage, den LLM-Hinweis und einen Ausschnitt aus "
    "dem geprüften Dokument.\n\n"
    "REGELN — sehr wichtig:\n"
    "1. Du gibst für JEDEN Finding-Eintrag im Input GENAU EINEN Output-"
    "Eintrag zurück (keine ausgelassen, keine zusätzlichen).\n"
    "2. Die ID muss BUCHSTABENGENAU vom Input übernommen werden — "
    "nicht abgekürzt, nicht umformatiert (Beispiel: \"mc-DATA-3953-A04\" "
    "bleibt \"mc-DATA-3953-A04\").\n"
    "3. Reihenfolge der Output-Items entspricht der Input-Reihenfolge.\n\n"
    "Pro Finding:\n"
    "- title: TOPIC-STATEMENT (max 80 Zeichen, ohne Frageton, "
    "nennt die Norm wenn sinnvoll). Beispiel: "
    "Frage \"Dokumentiert die DSI Drittlandtransfers nach Art. 49?\" "
    "→ title \"Drittlandtransfer-Doku Art. 49 DSGVO\".\n"
    "- severity: HIGH (klar verletzt), MEDIUM (verletzt, weniger "
    "kritisch), LOW (unsicher / manuelle Prüfung), DROP "
    "(Auszug zeigt klar dass die Anforderung erfüllt ist).\n"
    "- recommendation: KONKRETE Aktion (max 200 Zeichen), nennt "
    "WAS und WO. Beispiel: \"Im Abschnitt 'Drittlandtransfer' "
    "der DSE pro Empfänger einen Mechanismus nach Art. 49 ergänzen\".\n"
    "- drop: true wenn severity=DROP, sonst false.\n\n"
    "JSON-Schema (genauso antworten):\n"
    "{\"findings\":["
    "{\"id\":\"<exakte-id-vom-input>\",\"title\":\"...\","
    "\"severity\":\"HIGH|MEDIUM|LOW|DROP\","
    "\"recommendation\":\"...\",\"drop\":false}"
    "]}\n\n"
    "Beispiel-Antwort bei 2 Inputs mit IDs mc-A und mc-B:\n"
    "{\"findings\":[{\"id\":\"mc-A\",\"title\":\"Norm X erfüllen\","
    "\"severity\":\"MEDIUM\",\"recommendation\":\"In Abschnitt Y "
    "ergänzen: Norm X erfüllt\",\"drop\":false},"
    "{\"id\":\"mc-B\",\"title\":\"Norm Z geprüft\",\"severity\":\"DROP\","
    "\"recommendation\":\"Bereits erfüllt — Hinweis im Doc Z3\","
    "\"drop\":true}]}"
)


def _build_user_prompt(items: list[dict], doc_title: str,
                        doc_excerpt: str) -> str:
    """Render the user message for one batch of findings.

    The message names the document, includes the excerpt truncated to
    DOC_EXCERPT_CHARS characters, and lists the findings one per line,
    numbered from 1, each with its ID, question label, hint (first 200
    characters) and the severity assigned by the earlier pipeline.
    """
    rows: list[str] = []
    for number, finding in enumerate(items, start=1):
        hint = finding.get("hint", "")[:200]
        regex_severity = finding.get("severity")
        rows.append(
            f'{number}. ID="{finding["id"]}" | FRAGE: {finding["label"]} | '
            f'HINT: {hint} | SEV_REGEX: {regex_severity}'
        )
    findings_block = "\n".join(rows)
    excerpt = doc_excerpt[:DOC_EXCERPT_CHARS]
    sections = (
        f"DOKUMENT: {doc_title}\n\n",
        f"DOKUMENT-AUSZUG (max {DOC_EXCERPT_CHARS} Zeichen):\n",
        f"{excerpt}\n\n",
        f"FINDINGS ZU BEWERTEN:\n{findings_block}",
    )
    return "".join(sections)


async def _post_llm(body: dict) -> str:
    """POST one chat request to Ollama and return the reply text.

    Any exception raised while sending the request, checking the HTTP
    status or decoding the response is logged as a warning and turned
    into an empty string, so the caller can choose a fallback strategy.
    A missing or empty "message"/"content" also yields an empty string.
    """
    try:
        async with httpx.AsyncClient(timeout=TIMEOUT) as client:
            response = await client.post(f"{OLLAMA_URL}/api/chat", json=body)
            response.raise_for_status()
            payload = response.json()
            message = payload.get("message") or {}
            text = message.get("content", "")
            return text or ""
    except Exception as exc:
        logger.warning("plausibility LLM call failed: %s", exc)
        return ""


def _try_extract_json(content: str) -> dict | None:
    """Extract a JSON object from free-form LLM output. Handles
    markdown-fenced and prose-wrapped responses."""
    if not content:
        return None
    s = content.strip()
    # Strip ```json … ``` fences
    if s.startswith("```"):
        s = s.strip("`")
        if s.lower().startswith("json"):
            s = s[4:]
        s = s.strip()
    # Heuristic: cut from first { to last }
    first = s.find("{")
    last = s.rfind("}")
    if first >= 0 and last > first:
        s = s[first:last + 1]
    try:
        return json.loads(s)
    except Exception:
        return None


def _match_llm_findings(llm_findings: list,
                        input_ids: list[str]) -> dict[str, dict]:
    """Pair input IDs with raw LLM output entries, each entry used at most once.

    Matching runs in three passes, from most to least reliable:
      1. exact ID match,
      2. positional fallback (same index, entry not otherwise claimed),
      3. fuzzy match on the ID (last 8 characters of the input ID occur
         in the LLM ID, or the LLM ID occurs in the input ID).

    Non-dict entries are ignored. Returns input_id -> raw entry dict.
    """
    def entry_id(entry: object) -> str:
        if not isinstance(entry, dict):
            return ""
        return str(entry.get("id") or "").strip()

    wanted = set(input_ids)
    matched: dict[str, dict] = {}
    claimed: set[int] = set()

    # Phase 1: exact ID match. A repeated ID keeps its first entry; the
    # repeats are still marked claimed so they are not handed to another
    # input by the later passes.
    for idx, entry in enumerate(llm_findings):
        if not isinstance(entry, dict):
            claimed.add(idx)  # unusable, never assign it
            continue
        fid = entry_id(entry)
        if fid in wanted:
            claimed.add(idx)
            matched.setdefault(fid, entry)

    # Phase 2: position fallback — an input that is still unmapped takes
    # the entry at its own index if nobody claimed that entry.
    for idx, input_id in enumerate(input_ids):
        if input_id in matched:
            continue
        if idx < len(llm_findings) and idx not in claimed:
            matched[input_id] = llm_findings[idx]
            claimed.add(idx)

    # Phase 3: fuzzy match by ID tail, only over entries not yet claimed.
    if len(matched) < len(wanted):
        for idx, entry in enumerate(llm_findings):
            if idx in claimed:
                continue
            fid = entry_id(entry).lower()
            if not fid:
                continue
            for input_id in input_ids:
                if input_id in matched:
                    continue
                if input_id[-8:].lower() in fid or fid in input_id.lower():
                    matched[input_id] = entry
                    claimed.add(idx)
                    break
    return matched


async def _ask_llm_batch(items: list[dict], doc_title: str,
                          doc_excerpt: str) -> dict[str, dict]:
    """Send a batch of up to BATCH_SIZE findings to the LLM.

    Resilience strategy (P125 fix for empty-response bug):
      A. primary MODEL + format='json' (strict)
      B. primary MODEL without format (loose), JSON extracted manually
      C. FALLBACK_MODEL + format='json' (smaller, more robust model)
      D. if the batch has more than 2 items: split in half and recurse
      E. otherwise give up and return {}

    Args:
      items: dicts with "id" and "label" (required), "hint" and
             "severity" (optional), as consumed by _build_user_prompt.
      doc_title: human-readable document name for the prompt.
      doc_excerpt: document text; truncated by the prompt builder.

    Returns:
      input_id -> stamp dict (see _entry_to_stamp) for every finding that
      could be mapped to an LLM answer. Findings without an answer are
      simply absent. Errors during the exchange are logged and yield
      whatever was mapped so far (usually {}).
    """
    out: dict[str, dict] = {}
    if not items:
        # Nothing to ask — avoid a pointless LLM round-trip.
        return out

    user_prompt = _build_user_prompt(items, doc_title, doc_excerpt)

    def _body(model: str) -> dict:
        return {
            "model": model,
            "messages": [
                {"role": "system", "content": _SYSTEM_PROMPT},
                {"role": "user", "content": user_prompt},
            ],
            "stream": False,
            # Deterministic decoding so cached and fresh answers agree.
            "options": {"temperature": 0.0, "seed": 42, "num_predict": 1500},
        }

    input_ids = [it["id"] for it in items]
    try:
        # Strategy A: primary + format='json'
        content = await _post_llm({**_body(MODEL), "format": "json"})
        if not content:
            # Strategy B: primary + format-free
            logger.info(
                "plausibility A→empty, trying B (format-free) batch=%d",
                len(items),
            )
            content = await _post_llm(_body(MODEL))
        if not content and FALLBACK_MODEL and FALLBACK_MODEL != MODEL:
            # Strategy C: fallback-model + format='json'
            logger.info(
                "plausibility A+B empty, trying C (fallback=%s) batch=%d",
                FALLBACK_MODEL, len(items),
            )
            content = await _post_llm(
                {**_body(FALLBACK_MODEL), "format": "json"},
            )

        if not content:
            # Strategy D: split + recurse
            if len(items) > 2:
                half = len(items) // 2
                logger.info(
                    "plausibility A+B empty → split %d → %dx2",
                    len(items), half,
                )
                first = await _ask_llm_batch(
                    items[:half], doc_title, doc_excerpt,
                )
                second = await _ask_llm_batch(
                    items[half:], doc_title, doc_excerpt,
                )
                out.update(first)
                out.update(second)
                return out
            # Strategy E: give up
            logger.warning(
                "plausibility gave up after A+B for batch=%d", len(items),
            )
            return out

        # We have content: parse it and map the answers back to the inputs.
        # (This must run at function level — nested under the empty-content
        # branch it would never execute.)
        data = _try_extract_json(content)
        if not isinstance(data, dict):
            logger.warning(
                "plausibility LLM JSON parse failed (after fallback); "
                "raw=%s", content[:300],
            )
            return out
        llm_findings = data.get("findings") or []
        if isinstance(llm_findings, dict):
            # A single finding returned as an object instead of a list.
            llm_findings = [llm_findings]
        if not isinstance(llm_findings, list) or not llm_findings:
            logger.warning(
                "plausibility LLM returned 0 findings for %d input "
                "items; raw=%s", len(items), content[:300],
            )
            return out

        matched = _match_llm_findings(llm_findings, input_ids)
        for input_id, entry in matched.items():
            out[input_id] = _entry_to_stamp(entry)

        if not out:
            logger.warning(
                "plausibility could not map any of %d input IDs; "
                "raw=%s", len(input_ids), content[:300],
            )
        else:
            logger.info(
                "plausibility mapped %d/%d findings", len(out),
                len(input_ids),
            )
    except Exception as e:
        # Best-effort phase: a broken batch must not abort the report.
        logger.warning("plausibility batch failed: %s", e)
    return out


def _entry_to_stamp(entry: dict) -> dict:
    """Normalise one LLM finding into the stamp dict stored on a check.

    Args:
      entry: one element of the LLM's "findings" list. Values may be
             missing, None, or of an unexpected type.

    Returns:
      Dict with keys llm_title (max 200 chars), llm_severity (upper-cased,
      stripped; not validated here), llm_recommendation (max 400 chars)
      and llm_drop (bool).
    """
    def as_text(value: object) -> str:
        # Falsy values become "", anything else is coerced to text so a
        # numeric or otherwise non-string field cannot raise on slicing.
        value = value or ""
        return value if isinstance(value, str) else str(value)

    severity = as_text(entry.get("severity")).strip().upper()

    raw_drop = entry.get("drop", False)
    if isinstance(raw_drop, str):
        # bool("false") is True — interpret string flags by their content.
        drop = raw_drop.strip().lower() in ("true", "1", "yes", "ja")
    else:
        drop = bool(raw_drop)

    return {
        "llm_title": as_text(entry.get("title")).strip()[:200],
        "llm_severity": severity,
        "llm_recommendation": as_text(
            entry.get("recommendation")).strip()[:400],
        # The prompt defines drop as "true iff severity=DROP"; keep the two
        # consistent when the model sets the severity but omits the flag.
        "llm_drop": drop or severity == "DROP",
    }


async def verify_plausibility(results, doc_texts: dict[str, str]) -> None:
    """Stamp llm_* fields onto every failed MC CheckItem in results.

    Only checks that are neither passed nor skipped and whose id starts
    with "mc-" (case-insensitive) are considered. They are grouped by
    doc_type, sent to the LLM in batches of BATCH_SIZE, and the answers
    are written to llm_title, llm_severity, llm_recommendation and
    llm_drop on the check objects. Checks for which no answer could be
    obtained are left untouched.

    Args:
      results: list of DocCheckResult, each with .checks (list of CheckItem)
               and .doc_type
      doc_texts: doc_type -> source text excerpt for context; when a
                 doc_type has no text, the "dse" text is used instead.

    Returns:
      None. The check objects are mutated in place.
    """
    if not results:
        return
    doc_texts = doc_texts or {}

    # Gather candidate fails per doc_type so the prompt can scope the
    # excerpt correctly.
    by_doc: dict[str, list] = {}
    by_doc_meta: dict[str, str] = {}
    for r in results:
        dt = getattr(r, "doc_type", "")
        label = getattr(r, "label", "") or dt
        for c in getattr(r, "checks", []) or []:
            if getattr(c, "passed", True) or getattr(c, "skipped", False):
                continue
            # MC checks only — skip the structural P-* placement findings
            cid = (getattr(c, "id", "") or "").lower()
            if not cid.startswith("mc-"):
                continue
            by_doc.setdefault(dt, []).append(c)
            by_doc_meta[dt] = label

    if not by_doc:
        return

    total = sum(len(v) for v in by_doc.values())
    logger.info("plausibility-check: %d findings across %d docs",
                total, len(by_doc))

    # Circuit breaker against a total Ollama outage: after N consecutive
    # batches in which the LLM produced nothing, abort the whole phase
    # instead of waiting through every remaining call.
    consecutive_empty_budget = int(
        os.getenv("PLAUSIBILITY_EMPTY_BUDGET", "6"),
    )
    consecutive_empty = 0
    breaker_tripped = False

    for dt, checks in by_doc.items():
        if breaker_tripped:
            break
        doc_title = by_doc_meta.get(dt) or dt
        # Fall back to the DSE excerpt when the doc has no own text.
        doc_text = doc_texts.get(dt) or doc_texts.get("dse") or ""
        for i in range(0, len(checks), BATCH_SIZE):
            if breaker_tripped:
                break
            batch = checks[i:i + BATCH_SIZE]
            items = [
                {
                    "id": getattr(c, "id", ""),
                    # A None label would crash the checksum's .encode().
                    "label": getattr(c, "label", "") or "",
                    "hint": getattr(c, "hint", "") or "",
                    "severity": getattr(c, "severity", ""),
                }
                for c in batch
            ]
            # One checksum per item, reused for lookup and store.
            keys = [
                _checksum(it["id"], it["label"], it["hint"], doc_text)
                for it in items
            ]
            uncached_items = [
                it for it, key in zip(items, keys) if key not in _CACHE
            ]

            fresh: dict[str, dict] = {}
            if uncached_items:
                fresh = await _ask_llm_batch(
                    uncached_items, doc_title, doc_text,
                ) or {}

            # Merge cached and fresh answers; remember the fresh ones.
            batch_results: dict[str, dict] = {}
            for it, key in zip(items, keys):
                rid = it["id"]
                if key in _CACHE:
                    batch_results.setdefault(rid, _CACHE[key])
                elif rid in fresh:
                    _CACHE[key] = fresh[rid]
                    batch_results[rid] = fresh[rid]

            # Stamp onto each CheckItem
            stamped = 0
            for c in batch:
                cid = getattr(c, "id", "")
                res = batch_results.get(cid)
                if res is None:
                    continue
                try:
                    c.llm_title = res.get("llm_title", "") or ""
                    sev = res.get("llm_severity", "") or ""
                    c.llm_severity = sev if sev in (
                        "HIGH", "MEDIUM", "LOW", "DROP") else ""
                    c.llm_recommendation = res.get(
                        "llm_recommendation", "") or ""
                    c.llm_drop = bool(res.get("llm_drop", False))
                    stamped += 1
                except Exception as e:
                    # Stamping stays best-effort (e.g. a check type that
                    # rejects new attributes), but no longer fails silently.
                    logger.warning(
                        "plausibility could not stamp check %s: %s", cid, e,
                    )

            # Circuit breaker: a batch counts as empty when the LLM was
            # asked and returned nothing — even if cached answers were
            # stamped — so a warm cache cannot mask an unavailable LLM.
            # Batches served entirely from the cache never count as empty.
            if uncached_items and not fresh:
                consecutive_empty += 1
                if consecutive_empty >= consecutive_empty_budget:
                    logger.warning(
                        "plausibility circuit-breaker tripped after "
                        "%d consecutive empty batches — aborting phase",
                        consecutive_empty,
                    )
                    breaker_tripped = True
            elif stamped > 0:
                consecutive_empty = 0
            logger.info("plausibility-check %s: batch %d → %d stamped",
                        dt, len(batch), stamped)