"""Per-document regex + MC + LLM checks for the compliance-check route. Each document goes through: 1. regex completeness/correctness checklist 2. Master Control evaluation (all MCs for this doc_type) 3. LLM verification of failed regex checks (overturns where evidence was missed by the regex) 4. Cookie-only: opt-out + privacy-policy URL health-check """ from __future__ import annotations import logging logger = logging.getLogger(__name__) async def _check_single( text: str, doc_type: str, label: str, url: str, word_count: int, use_agent: bool, business_scope: set[str] | None = None, business_profile: dict | None = None, ): """Run regex + MC checks on a single document.""" from compliance.services.doc_checks.runner import check_document_completeness from compliance.services.rag_document_checker import check_document_with_controls from ..agent_doc_check_routes import CheckItem, DocCheckResult # Regex checklist findings = check_document_completeness(text, doc_type, label, url, business_profile=business_profile) all_checks: list[CheckItem] = [] completeness = 0 correctness = 0 for f in findings: if "SCORE" in f.get("code", ""): for c in f.get("all_checks", []): all_checks.append(CheckItem( id=c["id"], label=c["label"], passed=c["passed"], severity=c["severity"], matched_text=c.get("matched_text", ""), level=c.get("level", 1), parent=c.get("parent"), skipped=c.get("skipped", False), hint=c.get("hint", ""), )) completeness = f.get("completeness_pct", 0) correctness = f.get("correctness_pct", 0) # Master Control checks (top 20 by severity to avoid noise) try: # max_controls=0 -> evaluate ALL MCs for this doc_type (DB has # 1874 across 8 types; regex matching is cheap and dominates # well under 1s per doc). Caps remain on the LLM-enrich step # (top-10 FAILs) so cost stays bounded. mc_results = await check_document_with_controls( text, doc_type, label, max_controls=0, use_agent=use_agent, business_scope=business_scope, ) if mc_results: for mc in mc_results: all_checks.append(CheckItem(**mc)) l2 = [c for c in all_checks if c.level == 2 and not c.skipped] l2_passed = sum(1 for c in l2 if c.passed) correctness = round(l2_passed / len(l2) * 100) if l2 else 0 except Exception as e: logger.warning("MC check skipped for %s: %s", label, e) # LLM verification of regex fails failed = [c for c in all_checks if not c.passed and not c.skipped and c.hint] if failed: try: from compliance.services.doc_checks.llm_verify import verify_failed_checks overturns = await verify_failed_checks( text, [{"id": c.id, "label": c.label, "hint": c.hint} for c in failed], label, ) for c in all_checks: if c.id in overturns and overturns[c.id]["overturned"]: c.passed = True c.matched_text = f"[LLM] {overturns[c.id]['evidence']}" l2_active = [c for c in all_checks if c.level == 2 and not c.skipped] l2_passed = sum(1 for c in l2_active if c.passed) if l2_active: correctness = round(l2_passed / len(l2_active) * 100) except Exception as e: logger.warning("LLM verification skipped: %s", e) # Cookie-policy only: actively HTTP-probe the Opt-Out + Privacy-Policy # URLs the document advertises. Broken links make individual provider # entries non-compliant under Art. 7(3) DSGVO. if doc_type == "cookie": try: from compliance.services.cookie_link_validator import ( extract_links, validate_links, build_check_items, ) links = extract_links(text) if links: logger.info("Cookie-link validator: %d urls extracted from %s", len(links), label) validated = await validate_links(links) for item in build_check_items(validated): all_checks.append(CheckItem(**item)) # Re-compute correctness with the new L2 items l2_active = [c for c in all_checks if c.level == 2 and not c.skipped] l2_passed = sum(1 for c in l2_active if c.passed) if l2_active: correctness = round(l2_passed / len(l2_active) * 100) except Exception as e: logger.warning("Cookie-link validation skipped for %s: %s", label, e) non_score = [f for f in findings if "SCORE" not in f.get("code", "")] return DocCheckResult( label=label, url=url, doc_type=doc_type, word_count=word_count or len(text.split()), completeness_pct=completeness, correctness_pct=correctness, checks=all_checks, findings_count=len(non_score), )