fix: Section splitter only splits at classified headings + LLM gets full text

Two critical fixes: 1. Section splitter: Only lines that classify as a known doc_type (cookie, social_media, dsfa, etc.) trigger section splits. Random short lines ("Typen", "Funktionale Cookies") no longer split sections — the previous "preceded by a blank line" heuristic could not filter them out, because they all had blank lines before them in the extracted HTML text. 2. LLM verification: Sub-section checks now pass the full document text to the LLM, not just the section fragment. This lets the LLM find content that the section splitter missed. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-07 16:28:17 +02:00
parent a1b9273649
commit a4b75dc6b1
1 changed file with 21 additions and 12 deletions
@@ -212,6 +212,7 @@ async def _check_single_document(entry: DocCheckEntry) -> list[DocCheckResult]:
            all_results.append(main_result)

            # Sub-section checks (auto-detected from headings)
+            # Pass full doc_text for LLM verification fallback
            for section in sections:
                if section["word_count"] < 100:
                    continue
@@ -219,6 +220,7 @@ async def _check_single_document(entry: DocCheckEntry) -> list[DocCheckResult]:
                    section["text"], section["doc_type"],
                    section["title"], entry.url,
                    section["word_count"],
+                    full_text=doc_text,
                )
                all_results.append(sub_result)

@@ -232,8 +234,16 @@ async def _check_single_document(entry: DocCheckEntry) -> list[DocCheckResult]:
        )]


-async def _run_checklist(text: str, doc_type: str, label: str, url: str, word_count: int = 0) -> DocCheckResult:
-    """Run checklist against text, then LLM-verify failed checks."""
+async def _run_checklist(
+    text: str, doc_type: str, label: str, url: str,
+    word_count: int = 0, full_text: str = "",
+) -> DocCheckResult:
+    """Run checklist against text, then LLM-verify failed checks.
+
+    Args:
+        full_text: Optional full document text for LLM verification.
+                   If empty, uses `text` (the section fragment).
+    """
    findings = check_document_completeness(text, doc_type, label, url)

    all_checks: list[CheckItem] = []
@@ -259,7 +269,7 @@ async def _run_checklist(text: str, doc_type: str, label: str, url: str, word_co
        try:
            from compliance.services.doc_checks.llm_verify import verify_failed_checks
            overturns = await verify_failed_checks(
-                text,
+                full_text or text,
                [{"id": c.id, "label": c.label, "hint": c.hint} for c in failed],
                label,
            )
@@ -338,31 +348,30 @@ def _split_into_sections(text: str, parent_label: str, url: str) -> list[dict]:
                "word_count": len(sec_text.split()),
            })

-    prev_blank = False
    for line in lines:
        stripped = line.strip()
+        # Only split at headings that classify as a known document type.
+        # This prevents table content ("Funktionale Cookies", "Typen")
+        # from triggering section splits.
        is_heading = (
            5 < len(stripped) < 80
            and not stripped.endswith(".")
            and not stripped.endswith(",")
            and stripped[0].isupper()
-            # Require preceding blank line to distinguish real headings
-            # from table content ("Funktionale Cookies", "Session Cookies")
-            and prev_blank
        )
-        is_skip = is_heading and stripped.lower().strip() in SKIP_HEADINGS
+        classified = _classify_section(stripped) if is_heading else None
+        is_real_heading = is_heading and classified is not None
+        is_skip = is_real_heading and stripped.lower().strip() in SKIP_HEADINGS

-        if is_heading and not is_skip and current_heading:
+        if is_real_heading and not is_skip and current_heading:
            _save_section(current_heading, current_text)

-        if is_heading and not is_skip:
+        if is_real_heading and not is_skip:
            current_heading = stripped
            current_text = []
        else:
            current_text.append(line)

-        prev_blank = len(stripped) == 0
-
    # Last section
    if current_heading:
        _save_section(current_heading, current_text)