# breakpilot-compliance/backend-compliance/compliance/services/doc_checks/llm_verify.py

"""
LLM verification for regex check results.

When a regex check FAILs, the LLM re-checks the original text
to confirm or overturn the finding. This eliminates false positives
caused by regex limitations (unusual formatting, synonyms, etc.).

Uses the self-hosted Ollama endpoint (Qwen) for fast local inference.
"""

import logging
import os
import httpx

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Base URL of the Ollama server; overridable via the OLLAMA_URL env var.
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
# Model tag sent with every verification request; overridable via the
# OLLAMA_VERIFY_MODEL env var.
OLLAMA_MODEL = os.getenv("OLLAMA_VERIFY_MODEL", "qwen3.5:35b-a3b")
# NOTE(review): not referenced by the code visible in this module —
# _ask_llm_batch passes its own hard-coded timeout=120.0 to httpx.
# Presumably a timeout in seconds; confirm whether anything still uses it.
TIMEOUT = 30.0


async def verify_failed_checks(
    text: str,
    failed_checks: list[dict],
    doc_title: str,
) -> dict[str, dict]:
    """Re-check failed regex checks with the LLM in one batched request.

    Only checks that carry a non-empty ``hint`` are submitted. If none do,
    no request is made and an empty dict is returned.

    Args:
        text: Document text; only the first 8000 characters are sent.
        failed_checks: Check dicts that failed the regex pass.
        doc_title: Document title, used in the prompt and in log lines.

    Returns:
        Mapping of check id -> ``{"overturned": bool, "evidence": str}``
        for every id the LLM answered. Any error during the batch call is
        logged as a warning and whatever was collected so far is returned.
    """
    verdicts: dict[str, dict] = {}

    hinted = [check for check in failed_checks if check.get("hint")]
    if not hinted:
        return verdicts

    # Cap the document text so the prompt stays within the context window.
    excerpt = text[:8000]

    try:
        answers = await _ask_llm_batch(excerpt, hinted, doc_title)
        for check_id, answer in answers.items():
            overturned = answer.get("found", False)
            evidence = answer.get("evidence", "")
            verdicts[check_id] = {"overturned": overturned, "evidence": evidence}
            if not overturned:
                continue
            logger.info(
                "LLM overturned regex FAIL for '%s' in '%s': %s",
                check_id, doc_title, evidence[:80],
            )
    except Exception as e:
        # Best-effort: a failed verification must not break the caller.
        logger.warning("LLM batch verify failed for '%s': %s", doc_title, e)

    return verdicts


async def _ask_llm_batch(
    text: str, checks: list[dict], doc_title: str,
) -> dict[str, dict]:
    """Send every failed check to the LLM in a single chat request.

    Builds a numbered checklist (id, label, hint cut to 120 characters),
    posts it together with the document text to ``{OLLAMA_URL}/api/chat``
    with ``format="json"``, ``temperature`` 0.0 and ``num_predict`` 3000,
    and hands the returned message content to ``_parse_batch_response``.

    Args:
        text: Document text to include in the prompt.
        checks: Check dicts; each must have ``id`` and ``label`` keys.
        doc_title: Document title shown to the model.

    Returns:
        Whatever ``_parse_batch_response`` extracts from the reply.

    Raises:
        KeyError: If a check lacks ``id`` or ``label``.
        Exception: httpx transport/status errors propagate to the caller.
    """
    checklist = "\n".join(
        f'{pos}. ID="{check["id"]}" | {check["label"]} | {check.get("hint", "")[:120]}'
        for pos, check in enumerate(checks, 1)
    )

    system_prompt = (
        "Du pruefst ob ein Dokument bestimmte Pflichtangaben enthaelt. "
        "Antworte AUSSCHLIESSLICH mit einem JSON-Objekt: "
        '{"results": [{"id": "<check-id>", "found": true|false, '
        '"evidence": "<kurzes Zitat oder leer>"}]}. '
        "Keine Erklaerungen, keine Reasoning-Tags, kein Markdown."
    )
    user_prompt = "\n\n".join([
        f'DOKUMENT: "{doc_title}"',
        f"ANFORDERUNGEN:\n{checklist}",
        f"TEXT:\n{text}",
    ])

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    request_body = {
        "model": OLLAMA_MODEL,
        "messages": messages,
        "stream": False,
        # Ask the server for JSON-formatted output.
        "format": "json",
        "options": {"temperature": 0.0, "num_predict": 3000},
    }

    async with httpx.AsyncClient(timeout=120.0) as http:
        response = await http.post(f"{OLLAMA_URL}/api/chat", json=request_body)
        response.raise_for_status()
        reply = response.json()

    # "message" may be absent or null; treat both as an empty message.
    message = reply.get("message") or {}
    return _parse_batch_response(message.get("content", ""), checks)


def _parse_batch_response(raw: str, checks: list[dict]) -> dict[str, dict]:
    """Parse the batched LLM reply into per-check answers.

    Tolerates ``<think>…</think>`` wrappers, markdown code fences, and either
    an object holding the list under ``results`` / ``checks`` / ``items`` /
    ``verifications`` or a bare top-level list. If JSON parses but yields no
    usable list, a regex over ``raw`` picks out individual id/found pairs.

    Args:
        raw: Message content returned by the model.
        checks: The checks that were submitted; used only for the
            "n/m parsed" log lines.

    Returns:
        Mapping of check id -> ``{"found": bool, "evidence": str}``.
        Evidence is cut to 150 characters (empty in the regex fallback).
        Empty dict when nothing could be parsed.
    """
    import json
    import re

    results: dict[str, dict] = {}
    if not raw:
        logger.info("LLM batch: empty response from model")
        return results

    text = raw.strip()
    # Strip qwen3 thinking tags
    text = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL).strip()
    # Strip markdown code fences
    m = re.search(r"```(?:json)?\s*(.+?)\s*```", text, re.DOTALL)
    if m:
        text = m.group(1).strip()

    # Try parse as-is
    parsed = None
    try:
        parsed = json.loads(text)
    except (json.JSONDecodeError, ValueError):
        # Try finding the first JSON object or array in the text
        for pattern in (r"\{.*\}", r"\[.*\]"):
            mm = re.search(pattern, text, re.DOTALL)
            if mm:
                try:
                    parsed = json.loads(mm.group(0))
                    break
                except (json.JSONDecodeError, ValueError):
                    continue

    if parsed is None:
        logger.info(
            "LLM batch: 0/%d checks parsed (raw head: %r)",
            len(checks), raw[:120],
        )
        return results

    # Accept both {"results": [...]} (preferred) and bare list
    items = None
    if isinstance(parsed, dict):
        for key in ("results", "checks", "items", "verifications"):
            if isinstance(parsed.get(key), list):
                items = parsed[key]
                break
    elif isinstance(parsed, list):
        items = parsed

    if not items:
        # Final fallback: regex over individual id/found pairs
        for mm in re.finditer(
            r'\{[^}]*"id"\s*:\s*"([^"]+)"[^}]*"found"\s*:\s*(true|false)[^}]*\}',
            raw, re.DOTALL,
        ):
            results[mm.group(1)] = {
                "found": mm.group(2) == "true", "evidence": "",
            }
        logger.info("LLM batch: %d/%d checks parsed (regex fallback)",
                    len(results), len(checks))
        return results

    for item in items:
        if not isinstance(item, dict):
            continue
        cid = item.get("id", "")
        # Skip missing ids and JSON containers: a list/dict id is unhashable
        # and would raise on the dict insert, losing the whole batch.
        if not cid or isinstance(cid, (list, dict)):
            continue
        evidence = item.get("evidence")
        results[cid] = {
            "found": _coerce_found(item.get("found", False)),
            # A JSON null must become "", not the literal text "None".
            "evidence": "" if evidence is None else str(evidence)[:150],
        }

    logger.info("LLM batch: %d/%d checks parsed", len(results), len(checks))
    return results


def _coerce_found(value: object) -> bool:
    """Interpret a model-supplied ``found`` value as a boolean.

    Strings are compared case-insensitively against a small set of
    affirmative spellings, because ``bool("false")`` is True in Python and
    would wrongly overturn a failed check. Any non-string value falls back
    to ordinary truthiness.
    """
    if isinstance(value, str):
        return value.strip().lower() in ("true", "yes", "ja", "wahr", "1")
    return bool(value)