"""URL → text fetch helper for the compliance-check pipeline. Tries the consent-tester service first (Playwright, full JS render + CMP capture). On any failure or empty result, falls back to a direct HTTP GET with an identifiable User-Agent and per-domain rate limiting. For cookie/dse/social_media doc types we cap discovery to 1 sub-page (the policy itself is authoritative). For Impressum/AGB/Widerruf and similar enterprise-split pages we follow up to 3 sub-pages. """ from __future__ import annotations import logging import re as _re import httpx from ._constants import CONSENT_TESTER_URL logger = logging.getLogger(__name__) async def _fetch_text(url: str, doc_type: str = "") -> tuple[str, list[dict]]: """Fetch text from URL via consent-tester, with HTTP fallback. Returns (text, cmp_payloads). cmp_payloads is the raw CMP JSON captured during navigation (ePaaS, OneTrust, …) — empty when no CMP fired or HTTP fallback was used. Backend turns payloads into structured vendor records for the VVT table in the email. """ # 1. Consent-tester (Playwright-based, full JS rendering). # max_documents depends on doc_type: # - cookie/dse/social_media: self-extract (often + CMP capture) is # authoritative, sub-pages dilute the policy text. max=1. # - impressum/agb/widerruf/nutzungsbedingungen/dsb: BMW & similar # enterprise sites split this across 3-4 short sub-pages # (Versicherungsvermittler, Aufsicht, Berufsrecht). max=3 follows # them. The 15s networkidle bail (dsi_helpers) keeps timing safe. short_extract_types = {"cookie", "dse", "datenschutz", "privacy", "social_media"} max_docs = 1 if (doc_type or "") in short_extract_types else 3 try: # P90: 120s reicht nicht fuer BMW-Impressum (Auto-Discovery folgt # 3 Sub-Docs). 240s gibt Spielraum. Mercedes faellt aktuell mit # 120s auch oft an Akamai-Latenz. async with httpx.AsyncClient(timeout=240.0) as client: resp = await client.post( f"{CONSENT_TESTER_URL}/dsi-discovery", json={"url": url, "max_documents": max_docs}, timeout=240.0, ) if resp.status_code == 200: payload = resp.json() docs = payload.get("documents", []) cmp_payloads = payload.get("cmp_payloads") or [] cmp_cookie_text = payload.get("cmp_cookie_text") or "" # D — wenn der consent-tester HTML-Tabellen aus dem DOM # extrahiert hat, in die cmp_payloads als "generic_table" # einschleusen damit das Backend sie via cookies_table_parser # verarbeiten kann. for doc in (docs or []): for tbl in (doc.get("tables") or []): if not tbl or len(tbl) < 3: continue cmp_payloads.append({ "kind": "html_table", "url": doc.get("url", ""), "rows": tbl, }) if docs: texts = [] for doc in docs: t = doc.get("full_text", "") or doc.get("text_preview", "") or "" if t and len(t) > 50: texts.append(t) merged = "\n\n".join(texts) # For cookie/dse/social_media: when CMP reconstruction is # substantially richer than DOM extraction, use it. This # fixes the BMW case where DOM yields ~600 words of # navigation but the ePaaS payload reconstructs to ~1800 # words of actual cookie policy. if (doc_type in short_extract_types and cmp_cookie_text and len(cmp_cookie_text.split()) > len(merged.split())): logger.info( "Preferring CMP-reconstructed text for %s on %s " "(%d words CMP vs %d words DOM)", doc_type, url, len(cmp_cookie_text.split()), len(merged.split()), ) merged = cmp_cookie_text if merged and len(merged.split()) > 100: if len(texts) > 1: logger.info("Merged %d docs from %s (%d words)", len(texts), url, len(merged.split())) return merged, cmp_payloads # P90-Bug-Fix: auch wenn DSE-Text zu kurz fuer 100-Wort- # Schwelle ist, die captured CMP-Payloads NICHT verwerfen. # BMW-Bug: DSE liefert 10 Wort SPA-Shell, aber ePaaS-JSON # (393KB) wurde captured. Backend braucht die fuer # extract_vendors_from_payloads (VVT-Tabelle). if cmp_payloads: logger.info( "P90: keeping %d CMP payloads for %s despite " "short text (%d words) — HTTP fallback runs in parallel", len(cmp_payloads), url, len((merged or cmp_cookie_text).split()), ) fallback_text = merged or cmp_cookie_text or "" return fallback_text, cmp_payloads except Exception as e: # P90: verbose exception fuer Diagnose (war vorher empty) logger.warning("Consent-tester fetch failed for %s: %s (%s)", url, str(e) or "(empty)", type(e).__name__) # 2. Fallback: direct HTTP fetch (works for SSR pages like BMW). # P7: kenntlicher UA + per-Domain Rate-Limit. try: from compliance.services.compliance_user_agent import ( default_request_headers, DomainRateLimiter, ) async with httpx.AsyncClient( timeout=30.0, follow_redirects=True, headers=default_request_headers(), ) as client: async with DomainRateLimiter(url): resp = await client.get(url) if resp.status_code == 200 and "text/html" in resp.headers.get("content-type", ""): html = resp.text # Strip HTML tags, decode entities text = _re.sub(r"]*>.*?", " ", html, flags=_re.DOTALL | _re.IGNORECASE) text = _re.sub(r"]*>.*?", " ", text, flags=_re.DOTALL | _re.IGNORECASE) text = _re.sub(r"<[^>]+>", " ", text) text = _re.sub(r"\s+", " ", text).strip() if len(text.split()) > 100: logger.info("HTTP fallback for %s: %d words", url, len(text.split())) return text, [] except Exception as e: logger.warning("HTTP fallback failed for %s: %s", url, e) return "", []