diff --git a/backend-compliance/compliance/api/agent_doc_check_routes.py b/backend-compliance/compliance/api/agent_doc_check_routes.py index 82a6e81..b149b20 100644 --- a/backend-compliance/compliance/api/agent_doc_check_routes.py +++ b/backend-compliance/compliance/api/agent_doc_check_routes.py @@ -212,6 +212,7 @@ async def _check_single_document(entry: DocCheckEntry) -> list[DocCheckResult]: all_results.append(main_result) # Sub-section checks (auto-detected from headings) + # Pass full doc_text for LLM verification fallback for section in sections: if section["word_count"] < 100: continue @@ -219,6 +220,7 @@ async def _check_single_document(entry: DocCheckEntry) -> list[DocCheckResult]: section["text"], section["doc_type"], section["title"], entry.url, section["word_count"], + full_text=doc_text, ) all_results.append(sub_result) @@ -232,8 +234,16 @@ async def _check_single_document(entry: DocCheckEntry) -> list[DocCheckResult]: )] -async def _run_checklist(text: str, doc_type: str, label: str, url: str, word_count: int = 0) -> DocCheckResult: - """Run checklist against text, then LLM-verify failed checks.""" +async def _run_checklist( + text: str, doc_type: str, label: str, url: str, + word_count: int = 0, full_text: str = "", +) -> DocCheckResult: + """Run checklist against text, then LLM-verify failed checks. + + Args: + full_text: Optional full document text for LLM verification. + If empty, uses `text` (the section fragment). + """ findings = check_document_completeness(text, doc_type, label, url) all_checks: list[CheckItem] = [] @@ -259,7 +269,7 @@ async def _run_checklist(text: str, doc_type: str, label: str, url: str, word_co try: from compliance.services.doc_checks.llm_verify import verify_failed_checks overturns = await verify_failed_checks( - text, + full_text or text, [{"id": c.id, "label": c.label, "hint": c.hint} for c in failed], label, ) @@ -338,31 +348,30 @@ def _split_into_sections(text: str, parent_label: str, url: str) -> list[dict]: "word_count": len(sec_text.split()), }) - prev_blank = False for line in lines: stripped = line.strip() + # Only split at headings that classify as a known document type. + # This prevents table content ("Funktionale Cookies", "Typen") + # from triggering section splits. is_heading = ( 5 < len(stripped) < 80 and not stripped.endswith(".") and not stripped.endswith(",") and stripped[0].isupper() - # Require preceding blank line to distinguish real headings - # from table content ("Funktionale Cookies", "Session Cookies") - and prev_blank ) - is_skip = is_heading and stripped.lower().strip() in SKIP_HEADINGS + classified = _classify_section(stripped) if is_heading else None + is_real_heading = is_heading and classified is not None + is_skip = is_real_heading and stripped.lower().strip() in SKIP_HEADINGS - if is_heading and not is_skip and current_heading: + if is_real_heading and not is_skip and current_heading: _save_section(current_heading, current_text) - if is_heading and not is_skip: + if is_real_heading and not is_skip: current_heading = stripped current_text = [] else: current_text.append(line) - prev_blank = len(stripped) == 0 - # Last section if current_heading: _save_section(current_heading, current_text)