feat: LLM verification for regex FAILs + section-split hardening
Build + Deploy / build-admin-compliance (push) Successful in 1m49s
Build + Deploy / build-backend-compliance (push) Successful in 9s
Build + Deploy / build-ai-sdk (push) Successful in 8s
Build + Deploy / build-developer-portal (push) Successful in 8s
Build + Deploy / build-tts (push) Successful in 9s
Build + Deploy / build-document-crawler (push) Successful in 8s
Build + Deploy / build-dsms-gateway (push) Successful in 7s
Build + Deploy / build-dsms-node (push) Successful in 8s
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / loc-budget (push) Failing after 15s
CI / secret-scan (push) Has been skipped
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Successful in 2m55s
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / test-go (push) Failing after 45s
CI / test-python-backend (push) Successful in 42s
CI / test-python-document-crawler (push) Successful in 27s
CI / test-python-dsms-gateway (push) Successful in 26s
CI / validate-canonical-controls (push) Successful in 15s
Build + Deploy / trigger-orca (push) Successful in 2m13s

Path to 100% correctness: Regex finds 80%, LLM catches the rest.

1. LLM verification (llm_verify.py):
   - Every regex FAIL is re-checked by Qwen (qwen3:32b)
   - Binary YES/NO question with evidence extraction
   - Overturned checks marked with [LLM] prefix in matched_text
   - Graceful fallback if LLM unavailable
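The `verify_failed_checks` helper itself is not shown in this diff; only its call site appears below. A minimal sketch of the verdict-parsing half, assuming the model is prompted to reply with `ANSWER:` and `EVIDENCE:` lines (that reply format, and the `parse_verdict` name, are illustrative assumptions, not taken from this commit):

```python
def parse_verdict(response: str) -> dict:
    """Parse a binary YES/NO verdict plus evidence quote from an LLM reply.

    Assumes the prompt instructed the model to answer in the form:
        ANSWER: YES|NO
        EVIDENCE: <verbatim quote from the document>
    """
    answer = None
    evidence = ""
    for line in response.splitlines():
        line = line.strip()
        if line.upper().startswith("ANSWER:"):
            # "YES" never appears in "ANSWER: NO", so substring test is safe
            answer = "YES" in line.upper()
        elif line.upper().startswith("EVIDENCE:"):
            evidence = line.split(":", 1)[1].strip()
    # Missing/garbled answer counts as "not overturned" (graceful fallback)
    return {"overturned": bool(answer), "evidence": evidence}
```

The caller in the diff only needs the `overturned` flag and the `evidence` string, which it writes into `matched_text` with the `[LLM]` prefix.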

2. Section splitter hardening:
   - Short lines (<16 chars) are only treated as headings if preceded
     by a blank line — prevents table column headers ("Funktion",
     "Speicherdauer") from splitting cookie sections
   - Fixes the IHK cookie section: previously truncated at 288 words,
     now captured as a full section
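The hardened heuristic can be sketched as a standalone predicate. This is a simplification: the real condition in the diff below has further clauses elided by the hunk boundary, and `looks_like_heading` is a hypothetical name for illustration.

```python
def looks_like_heading(stripped: str, prev_blank: bool) -> bool:
    """Simplified sketch of the section-splitter heading test.

    Short lines only count as headings when preceded by a blank line,
    so table column headers like "Funktion" or "Speicherdauer" inside
    a cookie table no longer split the section.
    """
    return (
        bool(stripped)
        and not stripped.endswith((".", ","))
        and stripped[0].isupper()
        # the hardening: blank line before it, OR long enough (>15 chars)
        and (prev_blank or len(stripped) > 15)
    )
```

A table cell like "Funktion" (8 chars, no blank line above) is rejected, while a genuine heading such as "Cookies und Tracking-Technologien" passes on length alone.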

3. DSFA documentation patterns expanded:
   - Recognizes "4.) Ergebnis:" numbered result sections
   - Matches risk assessment conclusions
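A pattern in that spirit might look like the following. The actual regex and the checklist definitions it lives in are not part of this diff, so both the pattern and the `has_dsfa_result_section` name are illustrative assumptions:

```python
import re

# Hypothetical pattern matching numbered DSFA result headings such as
# "4.) Ergebnis:" or "4) Ergebnis:" (digit, optional dot, closing paren)
DSFA_RESULT_RE = re.compile(r"\b\d+\.?\)\s*Ergebnis\s*:", re.IGNORECASE)

def has_dsfa_result_section(text: str) -> bool:
    """True if the text contains a numbered 'Ergebnis:' result section."""
    return bool(DSFA_RESULT_RE.search(text))
```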

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Benjamin Admin
2026-05-07 15:34:07 +02:00
parent 1d75bbf4eb
commit 4f29e5ff3c
3 changed files with 165 additions and 4 deletions
@@ -202,7 +202,7 @@ async def _check_single_document(entry: DocCheckEntry) -> list[DocCheckResult]:
     all_results: list[DocCheckResult] = []
     # Main document check (full text against primary type)
-    main_result = _run_checklist(doc_text, entry.doc_type, entry.label, entry.url, word_count)
+    main_result = await _run_checklist(doc_text, entry.doc_type, entry.label, entry.url, word_count)
     # Control Library deep check — DISABLED until doc-check-specific
     # Master Controls with binary pass/fail criteria are available.
@@ -215,7 +215,7 @@ async def _check_single_document(entry: DocCheckEntry) -> list[DocCheckResult]:
     for section in sections:
         if section["word_count"] < 100:
             continue
-        sub_result = _run_checklist(
+        sub_result = await _run_checklist(
             section["text"], section["doc_type"],
             section["title"], entry.url,
             section["word_count"],
@@ -232,8 +232,8 @@ async def _check_single_document(entry: DocCheckEntry) -> list[DocCheckResult]:
     )]
-def _run_checklist(text: str, doc_type: str, label: str, url: str, word_count: int = 0) -> DocCheckResult:
-    """Run checklist against text and return structured result."""
+async def _run_checklist(text: str, doc_type: str, label: str, url: str, word_count: int = 0) -> DocCheckResult:
+    """Run checklist against text, then LLM-verify failed checks."""
     findings = check_document_completeness(text, doc_type, label, url)
     all_checks: list[CheckItem] = []
@@ -253,6 +253,29 @@ def _run_checklist(text: str, doc_type: str, label: str, url: str, word_count: i
     completeness = f.get("completeness_pct", 0)
     correctness = f.get("correctness_pct", 0)
+    # LLM verification: re-check regex FAILs to eliminate false positives
+    failed = [c for c in all_checks if not c.passed and not c.skipped and c.hint]
+    if failed:
+        try:
+            from compliance.services.doc_checks.llm_verify import verify_failed_checks
+            overturns = await verify_failed_checks(
+                text,
+                [{"id": c.id, "label": c.label, "hint": c.hint} for c in failed],
+                label,
+            )
+            for c in all_checks:
+                if c.id in overturns and overturns[c.id]["overturned"]:
+                    c.passed = True
+                    c.matched_text = f"[LLM] {overturns[c.id]['evidence']}"
+                    logger.info("LLM overturned: %s in %s", c.label, label)
+            # Recompute correctness after overturns
+            l2_active = [c for c in all_checks if c.level == 2 and not c.skipped]
+            l2_passed = sum(1 for c in l2_active if c.passed)
+            if l2_active:
+                correctness = round(l2_passed / len(l2_active) * 100)
+        except Exception as e:
+            logger.warning("LLM verification skipped: %s", e)
     non_score = [f for f in findings if "SCORE" not in f.get("code", "")]
     return DocCheckResult(
         label=label, url=url, doc_type=doc_type,
@@ -315,6 +338,7 @@ def _split_into_sections(text: str, parent_label: str, url: str) -> list[dict]:
             "word_count": len(sec_text.split()),
         })
+    prev_blank = False
     for line in lines:
         stripped = line.strip()
         is_heading = (
@@ -322,6 +346,10 @@ def _split_into_sections(text: str, parent_label: str, url: str) -> list[dict]:
             and not stripped.endswith(".")
            and not stripped.endswith(",")
            and stripped[0].isupper()
+            # Require preceding blank line OR line > 15 chars to avoid
+            # table column headers ("Funktion", "Speicherdauer") being
+            # treated as section headings
+            and (prev_blank or len(stripped) > 15)
         )
         is_skip = is_heading and stripped.lower().strip() in SKIP_HEADINGS
@@ -334,6 +362,8 @@ def _split_into_sections(text: str, parent_label: str, url: str) -> list[dict]:
         else:
             current_text.append(line)
+        prev_blank = len(stripped) == 0
+
     # Last section
     if current_heading:
         _save_section(current_heading, current_text)