feat: LLM verification for regex FAILs + section-split hardening

Path to 100% correctness: regex finds 80%, the LLM catches the rest.

1. LLM verification (llm_verify.py):
   - Every regex FAIL is re-checked by Qwen (qwen3:32b)
   - Binary YES/NO question with evidence extraction
   - Overturned checks are marked with an [LLM] prefix in matched_text
   - Graceful fallback if the LLM is unavailable

2. Section splitter hardening:
   - Short lines (<16 chars) are only treated as headings if preceded by a blank line — prevents table column headers ("Funktion", "Speicherdauer") from splitting cookie sections
   - Fixes IHK cookie section: 288 words → full section

3. DSFA documentation patterns expanded:
   - Recognizes "4.) Ergebnis:" numbered result sections
   - Matches risk assessment conclusions

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-07 15:34:07 +02:00
parent 1d75bbf4eb
commit 4f29e5ff3c
3 changed files with 165 additions and 4 deletions
@@ -0,0 +1,128 @@
+"""
+LLM verification for regex check results.
+
+When a regex check FAILs, the LLM re-checks the original text
+to confirm or overturn the finding. This eliminates false positives
+caused by regex limitations (unusual formatting, synonyms, etc.).
+
+Uses the self-hosted Ollama endpoint (Qwen) for fast local inference.
+"""
+
+import logging
+import os
+import httpx
+
+# Module-scoped logger used for verification diagnostics.
+logger = logging.getLogger(__name__)
+
+# Endpoint and model are read from the environment once at import time;
+# the literals below are the defaults used when the variables are unset.
+OLLAMA_URL = os.environ.get("OLLAMA_URL", "http://host.docker.internal:11434")
+OLLAMA_MODEL = os.environ.get("OLLAMA_VERIFY_MODEL", "qwen3:32b")
+# Handed to httpx.AsyncClient(timeout=...) for every LLM request.
+TIMEOUT = 30.0
+
+
+async def verify_failed_checks(
+    text: str,
+    failed_checks: list[dict],
+    doc_title: str,
+) -> dict[str, dict]:
+    """Re-examine regex FAIL findings with the LLM.
+
+    Every failed check that carries a non-empty "hint" (the natural-language
+    question for the model) is sent to the LLM together with the first
+    8000 characters of ``text``. Checks without a hint are skipped and get
+    no entry in the result.
+
+    Returns a dict mapping check id -> {"overturned": bool, "evidence": str}.
+    A check whose LLM call raises is logged as a warning and left out of
+    the result, so callers keep the original regex verdict for it.
+    """
+    verdicts: dict[str, dict] = {}
+    if not failed_checks:
+        return verdicts
+
+    # Only the leading part of the document is sent, to fit the context window.
+    excerpt = text[:8000]
+
+    for item in failed_checks:
+        item_id = item.get("id", "")
+        item_label = item.get("label", "")
+        question = item.get("hint", "")
+
+        if question:
+            try:
+                reply = await _ask_llm(excerpt, item_label, question, doc_title)
+                was_found = reply.get("found", False)
+                quote = reply.get("evidence", "")
+                verdicts[item_id] = {"overturned": was_found, "evidence": quote}
+                if was_found:
+                    logger.info(
+                        "LLM overturned regex FAIL for '%s' in '%s': %s",
+                        item_label, doc_title, quote[:80],
+                    )
+            except Exception as err:
+                # Best effort: an unavailable LLM must not break the check run.
+                logger.warning("LLM verify failed for '%s': %s", item_label, err)
+
+    return verdicts
+
+
+async def _ask_llm(
+    text: str, check_label: str, hint: str, doc_title: str,
+) -> dict:
+    """Send one verification question to Ollama and return the parsed answer.
+
+    Builds a German-language prompt from the requirement label, the hint,
+    the document title and the document text, posts it non-streaming to
+    ``{OLLAMA_URL}/api/generate`` and hands the "response" field of the
+    reply to ``_parse_llm_response``.
+
+    Transport errors and non-success status codes (via ``raise_for_status``)
+    propagate to the caller.
+    """
+    question = f"""/no_think
+Pruefe ob der folgende Dokumenttext die Anforderung erfuellt.
+
+ANFORDERUNG: {check_label}
+DETAILS: {hint}
+DOKUMENT: "{doc_title}"
+
+TEXT:
+{text}
+
+Antworte NUR mit einem JSON-Objekt (keine Erklaerung):
+{{"found": true/false, "evidence": "Zitat aus dem Text das die Anforderung belegt (max 100 Zeichen), oder leer wenn nicht gefunden"}}
+"""
+
+    endpoint = f"{OLLAMA_URL}/api/generate"
+    payload = {
+        "model": OLLAMA_MODEL,
+        "prompt": question,
+        "stream": False,
+        # Zero temperature and a small generation cap for a short JSON verdict.
+        "options": {"temperature": 0.0, "num_predict": 200},
+    }
+
+    async with httpx.AsyncClient(timeout=TIMEOUT) as session:
+        reply = await session.post(endpoint, json=payload)
+        reply.raise_for_status()
+        answer_text = reply.json().get("response", "")
+
+    return _parse_llm_response(answer_text)
+
+
+def _parse_llm_response(raw: str) -> dict:
+    """Extract the {"found", "evidence"} verdict from a raw LLM reply.
+
+    The reply may be bare JSON, JSON wrapped in a markdown code fence, or
+    JSON surrounded by extra prose. The first JSON object that contains a
+    "found" key is used. If no such object can be decoded, the text is
+    scanned for a literal ``"found": true`` as a last resort.
+
+    Args:
+        raw: Text returned by the model; ``None`` is treated as empty.
+
+    Returns:
+        ``{"found": bool, "evidence": str}`` with evidence capped at 150
+        characters. Evidence is empty when only the fallback scan matched
+        or nothing was found.
+    """
+    import json
+    import re
+
+    reply = (raw or "").strip()
+    decoder = json.JSONDecoder()
+
+    # Try every "{" as a potential object start and let the JSON decoder
+    # find the matching end. Unlike a "[^}]*" regex this survives braces
+    # inside the evidence quote and ignores code fences or prose around it.
+    for brace in re.finditer(r"\{", reply):
+        try:
+            data, _ = decoder.raw_decode(reply, brace.start())
+        except ValueError:
+            continue
+        if not isinstance(data, dict) or "found" not in data:
+            continue
+
+        found = data["found"]
+        if isinstance(found, str):
+            # bool("false") is True; interpret string answers explicitly so
+            # a quoted negative never overturns a regex FAIL.
+            found = found.strip().lower() in ("true", "yes", "ja", "1")
+
+        evidence = data.get("evidence")
+        return {
+            "found": bool(found),
+            "evidence": "" if evidence is None else str(evidence)[:150],
+        }
+
+    # Fallback for malformed JSON: accept only an unambiguous `"found": true`.
+    # The lookahead rejects an echoed prompt template ("true/false").
+    positive = re.search(r'"found"\s*:\s*true(?![\w/])', reply, re.IGNORECASE)
+    return {"found": positive is not None, "evidence": ""}