# breakpilot-compliance/backend-compliance/compliance/api/agent_check/_single_check.py

"""Per-document regex + MC + LLM checks for the compliance-check route.

Each document goes through:
  1. regex completeness/correctness checklist
  2. Master Control evaluation (all MCs for this doc_type)
  3. LLM verification of failed, non-skipped checks that carry a hint
     (overturns a failure when the regex missed the evidence)
  4. Cookie-only: opt-out + privacy-policy URL health-check
"""

from __future__ import annotations

import logging

# Module-level logger; the optional check stages in this module log a
# warning and carry on when they fail.
logger = logging.getLogger(__name__)


def _l2_correctness_pct(checks, default):
    """Return the pass rate (0-100) over the active level-2 checks.

    A check is "active" when ``level == 2`` and it is not ``skipped``.
    With no active level-2 check the rate is undefined, so *default* is
    returned unchanged.
    """
    active = [c for c in checks if c.level == 2 and not c.skipped]
    if not active:
        return default
    passed = sum(1 for c in active if c.passed)
    return round(passed / len(active) * 100)


async def _check_single(
    text: str, doc_type: str, label: str, url: str,
    word_count: int, use_agent: bool,
    business_scope: set[str] | None = None,
    business_profile: dict | None = None,
):
    """Run all per-document checks and return one ``DocCheckResult``.

    Stages, in order:
      1. regex checklist (``check_document_completeness``) -- mandatory;
         an exception here propagates to the caller.
      2. Master Control evaluation -- best effort.
      3. LLM verification of failed, non-skipped checks that carry a
         hint -- best effort.
      4. ``doc_type == "cookie"`` only: HTTP validation of the links
         extracted from the document -- best effort.

    A failing best-effort stage is logged as a warning and skipped; the
    result is then built from whatever the earlier stages produced.

    Args:
        text: Document text to check.
        doc_type: Document type key; ``"cookie"`` enables stage 4.
        label: Human-readable document name, used in results and logs.
        url: Source URL, passed to the regex checker and echoed back.
        word_count: Pre-computed word count; when falsy the count of
            whitespace-separated tokens in *text* is used instead.
        use_agent: Forwarded to the Master Control checker.
        business_scope: Forwarded to the Master Control checker.
        business_profile: Forwarded to the regex checker.

    Returns:
        ``DocCheckResult`` carrying every collected ``CheckItem``, the
        completeness/correctness percentages and the number of regex
        findings whose code does not contain ``"SCORE"``.
    """
    # NOTE(review): imports are function-local in the original --
    # presumably to avoid an import cycle with the routes module; confirm
    # before hoisting them to module level.
    from compliance.services.doc_checks.runner import check_document_completeness
    from compliance.services.rag_document_checker import check_document_with_controls
    from ..agent_doc_check_routes import CheckItem, DocCheckResult

    # Stage 1: regex checklist.
    findings = check_document_completeness(text, doc_type, label, url,
                                           business_profile=business_profile)

    all_checks: list[CheckItem] = []
    completeness = 0
    correctness = 0

    for f in findings:
        # Only "SCORE" findings carry the per-check breakdown and the
        # percentages; if several exist, the last one's percentages win.
        if "SCORE" not in f.get("code", ""):
            continue
        all_checks.extend([
            CheckItem(
                id=c["id"], label=c["label"], passed=c["passed"],
                severity=c["severity"], matched_text=c.get("matched_text", ""),
                level=c.get("level", 1), parent=c.get("parent"),
                skipped=c.get("skipped", False), hint=c.get("hint", ""),
            )
            for c in f.get("all_checks", [])
        ])
        completeness = f.get("completeness_pct", 0)
        correctness = f.get("correctness_pct", 0)

    # Stage 2: Master Control checks.
    try:
        # max_controls=0 -> evaluate ALL MCs for this doc_type (DB has
        # 1874 across 8 types; regex matching is cheap and dominates
        # well under 1s per doc). Caps remain on the LLM-enrich step
        # (top-10 FAILs) so cost stays bounded.
        mc_results = await check_document_with_controls(
            text, doc_type, label, max_controls=0, use_agent=use_agent,
            business_scope=business_scope,
        )
        if mc_results:
            # Build the whole batch before extending so a malformed entry
            # cannot leave half of the MC items behind with a stale score.
            all_checks.extend([CheckItem(**mc) for mc in mc_results])
            # Once MC results are merged, the score is taken from the
            # combined level-2 checks (0 when none are active).
            correctness = _l2_correctness_pct(all_checks, default=0)
    except Exception as e:
        logger.warning("MC check skipped for %s: %s", label, e)

    # Stage 3: LLM verification of failed checks. Only checks with a hint
    # are sent.
    failed = [c for c in all_checks if not c.passed and not c.skipped and c.hint]
    if failed:
        try:
            from compliance.services.doc_checks.llm_verify import verify_failed_checks
            # assumes the result maps check id -> {"overturned", "evidence"}
            # -- TODO confirm against verify_failed_checks
            overturns = await verify_failed_checks(
                text,
                [{"id": c.id, "label": c.label, "hint": c.hint} for c in failed],
                label,
            )
            # Only the checks that were actually submitted may be flipped;
            # .get() keeps one incomplete verdict from aborting the loop
            # after some checks were already mutated.
            for c in failed:
                verdict = overturns.get(c.id)
                if verdict and verdict.get("overturned"):
                    c.passed = True
                    c.matched_text = f"[LLM] {verdict.get('evidence', '')}"
            correctness = _l2_correctness_pct(all_checks, default=correctness)
        except Exception as e:
            logger.warning("LLM verification skipped: %s", e)

    # Stage 4, cookie-policy only: actively HTTP-probe the Opt-Out +
    # Privacy-Policy URLs the document advertises. Broken links make
    # individual provider entries non-compliant under Art. 7(3) DSGVO.
    if doc_type == "cookie":
        try:
            from compliance.services.cookie_link_validator import (
                extract_links, validate_links, build_check_items,
            )
            links = extract_links(text)
            if links:
                logger.info("Cookie-link validator: %d urls extracted from %s",
                            len(links), label)
                validated = await validate_links(links)
                all_checks.extend(
                    [CheckItem(**item) for item in build_check_items(validated)]
                )
                # Re-compute correctness now that the link items are in.
                correctness = _l2_correctness_pct(all_checks, default=correctness)
        except Exception as e:
            logger.warning("Cookie-link validation skipped for %s: %s", label, e)

    non_score = [f for f in findings if "SCORE" not in f.get("code", "")]
    return DocCheckResult(
        label=label, url=url, doc_type=doc_type,
        word_count=word_count or len(text.split()),
        completeness_pct=completeness, correctness_pct=correctness,
        checks=all_checks, findings_count=len(non_score),
    )