# breakpilot-compliance/backend-compliance/compliance/api/agent_check/_fetch.py

"""URL → text fetch helper for the compliance-check pipeline.

Tries the consent-tester service first (Playwright, full JS render +
CMP capture). On any failure or empty result, falls back to a direct
HTTP GET with an identifiable User-Agent and per-domain rate limiting.

For cookie/dse/datenschutz/privacy/social_media doc types we cap
discovery at ``max_documents=1`` (the policy itself is authoritative).
For every other doc type (Impressum/AGB/Widerruf and similar
enterprise-split pages) we allow up to 3 documents.
"""

from __future__ import annotations

import logging
import re as _re

import httpx

from ._constants import CONSENT_TESTER_URL

logger = logging.getLogger(__name__)


def _html_to_text(markup: str) -> str:
    """Reduce an HTML document to whitespace-normalised plain text.

    Drops ``<script>`` and ``<style>`` elements including their content,
    removes every remaining tag, decodes character references
    (``&auml;``, ``&nbsp;``, ``&#8211;`` …) and collapses each whitespace
    run to a single space.
    """
    # Function-scope import so this block does not depend on a new
    # module-level import.
    from html import unescape

    flags = _re.DOTALL | _re.IGNORECASE
    text = _re.sub(r"<script[^>]*>.*?</script>", " ", markup, flags=flags)
    text = _re.sub(r"<style[^>]*>.*?</style>", " ", text, flags=flags)
    text = _re.sub(r"<[^>]+>", " ", text)
    # Decode entities only after the tags are gone, so an escaped
    # "&lt;p&gt;" in the page copy survives as literal text instead of
    # being stripped as markup.
    text = unescape(text)
    return _re.sub(r"\s+", " ", text).strip()


async def _fetch_text(url: str, doc_type: str = "") -> tuple[str, list[dict]]:
    """Fetch text from URL via consent-tester, with HTTP fallback.

    Args:
        url: Page to fetch.
        doc_type: Document category. For cookie/dse/datenschutz/privacy/
            social_media the consent-tester is asked for a single
            document and a richer CMP-reconstructed text is preferred
            over the DOM text; every other value allows up to 3
            documents.

    Returns:
        ``(text, cmp_payloads)``. ``cmp_payloads`` is the raw CMP JSON
        captured during navigation (ePaaS, OneTrust, …) plus one
        ``{"kind": "html_table", ...}`` entry per extracted HTML table
        with at least 3 rows. It is empty when no CMP fired or the HTTP
        fallback was used. Backend turns payloads into structured vendor
        records for the VVT table in the email. ``("", [])`` is returned
        when neither source produced a usable result.

    Never raises for fetch errors: failures of either stage are logged
    as warnings and the next stage (or the empty result) is used.
    """
    # 1. Consent-tester (Playwright-based, full JS rendering).
    # max_documents depends on doc_type:
    #   - cookie/dse/social_media: self-extract (often + CMP capture) is
    #     authoritative, sub-pages dilute the policy text. max=1.
    #   - impressum/agb/widerruf/nutzungsbedingungen/dsb: BMW & similar
    #     enterprise sites split this across 3-4 short sub-pages
    #     (Versicherungsvermittler, Aufsicht, Berufsrecht). max=3 follows
    #     them. The 15s networkidle bail (dsi_helpers) keeps timing safe.
    short_extract_types = {"cookie", "dse", "datenschutz", "privacy", "social_media"}
    prefers_single_doc = (doc_type or "") in short_extract_types
    max_docs = 1 if prefers_single_doc else 3
    try:
        # P90: 120s is not enough for the BMW Impressum (auto-discovery
        # follows 3 sub-documents); 240s leaves headroom. Mercedes also
        # frequently exceeds 120s because of Akamai latency. The client
        # timeout applies to the request, so it is not repeated on post().
        async with httpx.AsyncClient(timeout=240.0) as client:
            resp = await client.post(
                f"{CONSENT_TESTER_URL}/dsi-discovery",
                json={"url": url, "max_documents": max_docs},
            )
        if resp.status_code == 200:
            payload = resp.json()
            docs = payload.get("documents") or []
            cmp_payloads = payload.get("cmp_payloads") or []
            cmp_cookie_text = payload.get("cmp_cookie_text") or ""
            # D — HTML tables the consent-tester extracted from the DOM are
            # appended to cmp_payloads as "html_table" entries so the
            # backend can process them (cookies_table_parser). Tables with
            # fewer than 3 rows are skipped.
            for doc in docs:
                for tbl in doc.get("tables") or []:
                    if tbl and len(tbl) >= 3:
                        cmp_payloads.append({
                            "kind": "html_table",
                            "url":  doc.get("url", ""),
                            "rows": tbl,
                        })
            # Must be bound even without documents: the CMP-payload branch
            # below reads it. Previously an empty "documents" list combined
            # with captured payloads raised UnboundLocalError there, which
            # the broad except swallowed — discarding the payloads.
            merged = ""
            if docs:
                candidates = (
                    doc.get("full_text", "") or doc.get("text_preview", "") or ""
                    for doc in docs
                )
                # Ignore fragments of 50 characters or fewer.
                texts = [t for t in candidates if len(t) > 50]
                merged = "\n\n".join(texts)
                word_count = len(merged.split())
                # For cookie/dse/social_media: when CMP reconstruction is
                # richer than DOM extraction, use it. This fixes the BMW
                # case where DOM yields ~600 words of navigation but the
                # ePaaS payload reconstructs to ~1800 words of actual
                # cookie policy.
                if prefers_single_doc and cmp_cookie_text:
                    cmp_words = len(cmp_cookie_text.split())
                    if cmp_words > word_count:
                        logger.info(
                            "Preferring CMP-reconstructed text for %s on %s "
                            "(%d words CMP vs %d words DOM)",
                            doc_type, url, cmp_words, word_count,
                        )
                        merged, word_count = cmp_cookie_text, cmp_words
                if word_count > 100:
                    if len(texts) > 1:
                        logger.info("Merged %d docs from %s (%d words)",
                                    len(texts), url, word_count)
                    return merged, cmp_payloads
            # P90 bug fix: even when the text is below the 100-word
            # threshold, do NOT discard captured CMP payloads. BMW case:
            # the DSE page yields a 10-word SPA shell, but the ePaaS JSON
            # (393KB) was captured and the backend needs it for
            # extract_vendors_from_payloads (VVT table).
            # NOTE(review): the log text mentions a parallel HTTP fallback,
            # but this function returns here without running step 2 —
            # confirm whether the caller performs that fallback.
            if cmp_payloads:
                fallback_text = merged or cmp_cookie_text or ""
                logger.info(
                    "P90: keeping %d CMP payloads for %s despite "
                    "short text (%d words) — HTTP fallback runs in parallel",
                    len(cmp_payloads), url,
                    len(fallback_text.split()),
                )
                return fallback_text, cmp_payloads
    except Exception as e:
        # P90: verbose exception output for diagnosis; str(e) alone is
        # empty for some exception types.
        logger.warning("Consent-tester fetch failed for %s: %s (%s)",
                       url, str(e) or "(empty)", type(e).__name__)

    # 2. Fallback: direct HTTP fetch (works for SSR pages like BMW).
    # P7: identifiable User-Agent + per-domain rate limit.
    try:
        from compliance.services.compliance_user_agent import (
            default_request_headers, DomainRateLimiter,
        )
        async with httpx.AsyncClient(
            timeout=30.0, follow_redirects=True,
            headers=default_request_headers(),
        ) as client:
            async with DomainRateLimiter(url):
                resp = await client.get(url)
        # Header values may differ in case ("Text/HTML; charset=...").
        content_type = resp.headers.get("content-type", "").lower()
        if resp.status_code == 200 and "text/html" in content_type:
            text = _html_to_text(resp.text)
            word_count = len(text.split())
            if word_count > 100:
                logger.info("HTTP fallback for %s: %d words", url, word_count)
                return text, []
    except Exception as e:
        # Same format as the consent-tester warning: timeouts often carry
        # an empty message, so the exception type is logged as well.
        logger.warning("HTTP fallback failed for %s: %s (%s)",
                       url, str(e) or "(empty)", type(e).__name__)

    return "", []