fix: Section splitter only splits at classified headings + LLM gets full text
Build + Deploy / build-admin-compliance (push) Successful in 2m33s
Build + Deploy / build-ai-sdk (push) Successful in 57s
Build + Deploy / build-developer-portal (push) Successful in 1m23s
Build + Deploy / build-tts (push) Successful in 1m33s
Build + Deploy / build-backend-compliance (push) Successful in 3m34s
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
Build + Deploy / build-document-crawler (push) Successful in 40s
Build + Deploy / build-dsms-gateway (push) Successful in 26s
Build + Deploy / build-dsms-node (push) Successful in 11s
CI / loc-budget (push) Failing after 23s
CI / secret-scan (push) Has been skipped
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Successful in 3m31s
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / test-go (push) Failing after 1m2s
CI / test-python-backend (push) Successful in 46s
CI / test-python-document-crawler (push) Successful in 32s
CI / test-python-dsms-gateway (push) Successful in 26s
CI / validate-canonical-controls (push) Successful in 17s
Build + Deploy / trigger-orca (push) Successful in 3m23s
Build + Deploy / build-admin-compliance (push) Successful in 2m33s
Build + Deploy / build-ai-sdk (push) Successful in 57s
Build + Deploy / build-developer-portal (push) Successful in 1m23s
Build + Deploy / build-tts (push) Successful in 1m33s
Build + Deploy / build-backend-compliance (push) Successful in 3m34s
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
Build + Deploy / build-document-crawler (push) Successful in 40s
Build + Deploy / build-dsms-gateway (push) Successful in 26s
Build + Deploy / build-dsms-node (push) Successful in 11s
CI / loc-budget (push) Failing after 23s
CI / secret-scan (push) Has been skipped
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Successful in 3m31s
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / test-go (push) Failing after 1m2s
CI / test-python-backend (push) Successful in 46s
CI / test-python-document-crawler (push) Successful in 32s
CI / test-python-dsms-gateway (push) Successful in 26s
CI / validate-canonical-controls (push) Successful in 17s
Build + Deploy / trigger-orca (push) Successful in 3m23s
Two critical fixes:
1. Section splitter: Only lines that classify as a known doc_type
(cookie, social_media, dsfa, etc.) trigger section splits.
Arbitrary short lines ("Typen", "Funktionale Cookies") no longer
split sections. The previous "preceded by a blank line" heuristic
could not filter them out, because they were all preceded by blank
lines in the extracted HTML text.
2. LLM verification: Sub-section checks now pass the full document
text to the LLM instead of only the section fragment (falling back
to the fragment when no full text is supplied). This lets the LLM
find content that the section splitter missed.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -212,6 +212,7 @@ async def _check_single_document(entry: DocCheckEntry) -> list[DocCheckResult]:
|
||||
all_results.append(main_result)
|
||||
|
||||
# Sub-section checks (auto-detected from headings)
|
||||
# Pass full doc_text for LLM verification fallback
|
||||
for section in sections:
|
||||
if section["word_count"] < 100:
|
||||
continue
|
||||
@@ -219,6 +220,7 @@ async def _check_single_document(entry: DocCheckEntry) -> list[DocCheckResult]:
|
||||
section["text"], section["doc_type"],
|
||||
section["title"], entry.url,
|
||||
section["word_count"],
|
||||
full_text=doc_text,
|
||||
)
|
||||
all_results.append(sub_result)
|
||||
|
||||
@@ -232,8 +234,16 @@ async def _check_single_document(entry: DocCheckEntry) -> list[DocCheckResult]:
|
||||
)]
|
||||
|
||||
|
||||
async def _run_checklist(text: str, doc_type: str, label: str, url: str, word_count: int = 0) -> DocCheckResult:
|
||||
"""Run checklist against text, then LLM-verify failed checks."""
|
||||
async def _run_checklist(
|
||||
text: str, doc_type: str, label: str, url: str,
|
||||
word_count: int = 0, full_text: str = "",
|
||||
) -> DocCheckResult:
|
||||
"""Run checklist against text, then LLM-verify failed checks.
|
||||
|
||||
Args:
|
||||
full_text: Optional full document text for LLM verification.
|
||||
If empty, uses `text` (the section fragment).
|
||||
"""
|
||||
findings = check_document_completeness(text, doc_type, label, url)
|
||||
|
||||
all_checks: list[CheckItem] = []
|
||||
@@ -259,7 +269,7 @@ async def _run_checklist(text: str, doc_type: str, label: str, url: str, word_co
|
||||
try:
|
||||
from compliance.services.doc_checks.llm_verify import verify_failed_checks
|
||||
overturns = await verify_failed_checks(
|
||||
text,
|
||||
full_text or text,
|
||||
[{"id": c.id, "label": c.label, "hint": c.hint} for c in failed],
|
||||
label,
|
||||
)
|
||||
@@ -338,31 +348,30 @@ def _split_into_sections(text: str, parent_label: str, url: str) -> list[dict]:
|
||||
"word_count": len(sec_text.split()),
|
||||
})
|
||||
|
||||
prev_blank = False
|
||||
for line in lines:
|
||||
stripped = line.strip()
|
||||
# Only split at headings that classify as a known document type.
|
||||
# This prevents table content ("Funktionale Cookies", "Typen")
|
||||
# from triggering section splits.
|
||||
is_heading = (
|
||||
5 < len(stripped) < 80
|
||||
and not stripped.endswith(".")
|
||||
and not stripped.endswith(",")
|
||||
and stripped[0].isupper()
|
||||
# Require preceding blank line to distinguish real headings
|
||||
# from table content ("Funktionale Cookies", "Session Cookies")
|
||||
and prev_blank
|
||||
)
|
||||
is_skip = is_heading and stripped.lower().strip() in SKIP_HEADINGS
|
||||
classified = _classify_section(stripped) if is_heading else None
|
||||
is_real_heading = is_heading and classified is not None
|
||||
is_skip = is_real_heading and stripped.lower().strip() in SKIP_HEADINGS
|
||||
|
||||
if is_heading and not is_skip and current_heading:
|
||||
if is_real_heading and not is_skip and current_heading:
|
||||
_save_section(current_heading, current_text)
|
||||
|
||||
if is_heading and not is_skip:
|
||||
if is_real_heading and not is_skip:
|
||||
current_heading = stripped
|
||||
current_text = []
|
||||
else:
|
||||
current_text.append(line)
|
||||
|
||||
prev_blank = len(stripped) == 0
|
||||
|
||||
# Last section
|
||||
if current_heading:
|
||||
_save_section(current_heading, current_text)
|
||||
|
||||
Reference in New Issue
Block a user